setwd("/Users/kelseyschoenemann/Desktop/Code_Datasets")
library(readxl) #read in data
library(readr)
library(tidyverse) #manipulate the data
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.0.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2) #plot data
# install.packages("devtools")
# devtools::install_github("r-lib/conflicted")
library(conflicted) #https://conflicted.r-lib.org
conflicts_prefer(dplyr::filter)
## [conflicted] Will prefer dplyr::filter over any other package.
#read in capture log of queens identity and behavior
CapLog <- read.csv("CaptureLog.csv")
CapLog[,c(13:15)][is.na(CapLog[,c(13:15)])] <- 0 #replacing NAs with 0s
CapLog$Bombus_spp<-as.factor(CapLog$Bombus_spp)
CapLog$BeeID<-as.factor(CapLog$BeeID)
CapLog$SiteID<-as.factor(CapLog$SiteID)
CapLog$Date<-as.POSIXct(CapLog$Date, format="%m/%e/%y")
#read in site data about survey conditions
SitDat <- read.csv("SiteData.csv")
SitDat[,c(10:13)][is.na(SitDat[,c(10:13)])] <- 0 #replacing NAs with 0s
SitDat$SiteID<-as.factor(SitDat$SiteID)
#read in survey effort data about search times by habitat and observer
Effort <- read.csv("SurveyEffort.csv")
Effort$SiteID<-as.factor(Effort$SiteID)
FlorSpp <- read.csv("FloweringSpp.csv")
FlorSpp$SiteID<-as.factor(FlorSpp$SiteID)
INFO about zonal statistics (i.e., how exactly the data I’m using was tabulated): https://gis.stackexchange.com/questions/276794/how-does-qgis-zonal-statistics-handle-partially-overlapping-pixels
#read in land use data extracted near survey sites
SitesLandUse <- read.csv("export-Sites.csv")
SitesLandUse <- SitesLandUse[,-c(1,5)] #remove columns that aren't Name, SiteID, Type, County or land use columns
SitesLandUse[,7:21][is.na(SitesLandUse[,7:21])] <- 0 #replace NAs in land use columns with 0
# rename columns
#LAND USE CLASSES
#NLCD
# ClassID Classification
# 11 Open Water
# 21 Developed, Open Space
# 22 Developed, Low Intensity
# 23 Developed, Medium Intensity
# 24 Developed High Intensity
# 31 Barren Land (Rock/Sand/Clay)
# 41 Deciduous Forest
# 42 Evergreen Forest
# 43 Mixed Forest
# 52 Shrub/Scrub
# 71 Grassland/Herbaceous
# 81 Pasture/Hay
# 82 Cultivated Crops
# 90 Woody Wetlands
# 95 Emergent Herbaceous Wetlands
SitesLandUse2 <- SitesLandUse %>% rename(
'NLCD_Water1000'='NLCD_Sites_1000m_11',
'NLCD_DevelopedOpen1000'='NLCD_Sites_1000m_21',
'NLCD_DevelopedLow1000'='NLCD_Sites_1000m_22',
'NLCD_DevelopedMed1000'='NLCD_Sites_1000m_23',
'NLCD_DevelopedHigh1000'='NLCD_Sites_1000m_24',
'NLCD_Barren1000'='NLCD_Sites_1000m_31',
'NLCD_DeciduousForest1000'='NLCD_Sites_1000m_41',
'NLCD_EvergreenForest1000'='NLCD_Sites_1000m_42',
'NLCD_MixedForest1000'='NLCD_Sites_1000m_43',
'NLCD_ShrubScrub1000'='NLCD_Sites_1000m_52',
'NLCD_Grassland1000'='NLCD_Sites_1000m_71',
'NLCD_PastureHay1000'='NLCD_Sites_1000m_81',
'NLCD_CultivatedCrops1000'='NLCD_Sites_1000m_82',
'NLCD_WoodyWetlands1000'='NLCD_Sites_1000m_90',
'NLCD_HerbaceousWetlands1000'='NLCD_Sites_1000m_95'
)
rm(SitesLandUse)
SitesLandUse2[,7:21] <- lapply(SitesLandUse2[,7:21], as.numeric) #change values to numeric
SitesLandUse2 <- SitesLandUse2 %>%
rowwise() %>% mutate(TotalCells1000_NLCD = rowSums(across('NLCD_Water1000':'NLCD_HerbaceousWetlands1000'))) #create new columns that equal the sum of all cells within 1000m of site centerpoint
#read in tables of ASV assignments for each sample
#(datasets called 'long' because each sample spans multiple rows, one for each ASV within a sample)
#for rbcL
rbcL_IDs <- read.csv("rbcL.IDs.long.csv")
rbcL_IDs$Sample <- as.factor(rbcL_IDs$Sample)
#repeat for ITS2
ITS2_IDs <- read.csv("ITS2.IDs.long.csv")
ITS2_IDs[ITS2_IDs=="s_NA"]<-NA #explicitly call these text strings NA values
ITS2_IDs[ITS2_IDs=="g_NA"]<-NA
ITS2_IDs$Sample <- as.factor(ITS2_IDs$Sample)
#bind the rbcL and ITS2 sequence taxonomies
PollenIDs<-rbind(
rbcL_IDs %>%
select(Sample, SampleTotalReads, Reads, Family, Genus, Species) %>%
mutate(Family=str_extract(Family, "(?<=f__)[a-zA-Z\\s]+(?=_)"),
Genus=str_extract(Genus, "(?<=g__)[a-zA-Z\\s]+(?=_)"), #extracting character strings so taxa names will match up with those in ITS2 dataset
Species=str_extract(Species, "(?<=\\s)[a-zA-Z\\s]+(?=_)"), #god bless chatgpt for this one doggamm
Barcode="rbcL") %>%
filter(SampleTotalReads>999) #filter out samples with less than 1000 reads in total
, # <-comma for rbind(rbcl,ITS2)
# Explanation of the Regex
# (?<=s__) - This is a lookbehind assertion that ensures the match occurs after "s__".
# [a-zA-Z\\s]+ - This matches one or more letters (both uppercase and lowercase) and spaces. (the + is important!)
# (?=_) - This is a lookahead assertion that ensures the match occurs before an underscore (_).
ITS2_IDs %>%
select(Sample, SampleTotalReads, Reads, Family, Genus, Species) %>%
mutate(Family=str_extract(Family, "(?<=f_)[a-zA-Z\\s]+"),
Genus=str_extract(Genus, "(?<=g_)[a-zA-Z\\s]+"), #character string a little diff for ITS2
Species=str_extract(Species, "(?<=\\s)[a-zA-Z\\s]+"),
Barcode="ITS2") %>%
filter(SampleTotalReads>999) #filter out samples with less than 1000 reads in total
) %>%
mutate(BeeID=Sample, #rename variable
#Barcode_Sample=paste(Barcode,Sample,sep = "_"), #create new variable with Barcode and Sample name appended together
Genus_species=paste(Genus,Species,sep = "_")) %>%
select(BeeID,Barcode,SampleTotalReads,Family,Genus,Species,Genus_species, Reads) %>%
mutate(PropReads=Reads/SampleTotalReads) %>%
# the data are still presented as ASVs (meaning multiple ASVs could be assigned to the same Genus/species, so now we will sum the reads for each ASV assigned to the same taxon for each sample)
group_by(across(BeeID:Genus_species)) %>% # group by sample and taxonomic assignment
summarise(across(Reads:PropReads, ~ sum(.x ))) %>% # and sum reads (or prop)
mutate(Family = # correcting out-dated taxonomic names inherited from rbcL reference database (affecting rbcL samples only)
ifelse(.data$Family=="Aceraceae", "Sapindaceae", #change Acer../Hipp.. to Sapi..
ifelse(.data$Family=="Hippocastanaceae", "Sapindaceae",
ifelse(.data$Genus=="Nyssa", "Nyssaceae", #change Corn.. to Nyss (I only have two species from Cornaceae in my data, Nyssa sylvatica, which is now in Nyssaceae family, and Cornus spp, which remains in Cornaceae family); link: https://doi.org/10.1111/boj.12385
.data$Family))))
## `summarise()` has grouped output by 'BeeID', 'Barcode', 'SampleTotalReads',
## 'Family', 'Genus', 'Species'. You can override using the `.groups` argument.
PollenIDs <- as.data.frame(PollenIDs) # revert tibble to dataframe
rm(ITS2_IDs, rbcL_IDs)
# generate summary table of queens captured by site-visit, behavior, pollen status
CapLog.sum <- CapLog %>% #
group_by(SiteID) %>% ## at each site,
summarise(CapFly=sum(Flying), ## sum number of bees captured while flying
CapForag=sum(Forage), ## ...or foraging
CapNest=sum(NestSeek), ## ..or nest-seeking
TotalCaptured=sum(Flying,Forage,NestSeek),
PollenBees=sum(CarryPollen))
# generate summary table of queens captured by site-visit and Bombus species
CapLog.spp <- CapLog %>%
group_by(SiteID, Bombus_spp) %>%
tally() %>%
pivot_wider(names_from = Bombus_spp, values_from = n) %>%
select(SiteID,bimaculatus,griseocollis,impatiens,auricomus,'vagans-sandersoni-perplexus',fervidus,pensylvanicus)
# generate summary table of start/end times and temps and observed tallys of queens by survey-date
## (this step was needed because Dave would sometimes do 2+ surveys at Blandy in a day, recording site conditions each time)
SitDat.sum <- as.data.frame(
bind_rows(
SitDat %>% filter(Name!="Blandy Experimental Farm"),
SitDat %>% filter(Name=="Blandy Experimental Farm") %>% group_by(Date) %>%
dplyr::summarize(SiteID=first(SiteID), Name=first(Name), Date=first(Date),
across('StartTimeH':'StartTimeM', list(dplyr::first)),
across('EndTimeH':'EndTempC', list(dplyr::last)),
across('DurationM', list(sum)),
across('CountFlying':'TotalObserved',list(sum))) %>%
'colnames<-'(c("Date","SiteID","Name", colnames(SitDat[,4:13]))) #renaming the summarized columns so they will match up with the original, un-summarized non-Blandy records
))
# The above code grabs rows from SitDat for sites OTHER than Blandy and combines (bind_rows) with the following:
# A summary of rows from SitDat for ONLY Blandy, grouped by Date, where I want
# first Date, Name, and SiteID
# the first StartTime, and last EndTime and EndTemp
# the sum of survey Duration
# the sum of the Count columns (which are expected to be zero for Blandy, since Dave didn't count bees not caught)
rm(SitDat)
# generate summary table of search time
Effort.sum <- Effort %>% group_by(SiteID) %>% #group by site
summarise(SearchTime=sum(SearchTime))
SiteEffort <- left_join(SitDat.sum,Effort.sum,by="SiteID")
SiteEffort$Date<-as.POSIXct(SiteEffort$Date, format="%d-%b-%y")
rm(Effort)
FlorSpp.sum <- FlorSpp %>% group_by(SiteID) %>%
summarize(FloralRichness=n())
###SiteLandUse.sum, SiteLandUse.sum2
# generate summary table of land cover near survey sites with *simplified* land cover classes (sum)
# expressed as proportional cover (rather than numbers of cells) ----
SitesLandUse.sum <- SitesLandUse2 %>%
mutate(across('NLCD_Water1000':'NLCD_HerbaceousWetlands1000',~.x/TotalCells1000_NLCD)) %>%
## ^^ divide all values ('~.x') by TotalCells for the given dataset
mutate(
#simplify land use categories
NLCD1000_Water = NLCD_Water1000,
NLCD1000_Developed = sum(NLCD_DevelopedOpen1000 + NLCD_DevelopedLow1000 + NLCD_DevelopedMed1000 + NLCD_DevelopedHigh1000),
NLCD1000_Forest = sum(NLCD_DeciduousForest1000 + NLCD_MixedForest1000 + NLCD_WoodyWetlands1000 + NLCD_ShrubScrub1000 + NLCD_EvergreenForest1000), #shrubscrub = forest bc in my dataset it is rare, and is sometimes misclassified (when tree farms, fields with sparse tree), but it usually has trees, so forest seems appropriate
NLCD1000_Herbaceous= sum(NLCD_Grassland1000 + NLCD_PastureHay1000 + NLCD_HerbaceousWetlands1000 + NLCD_Barren1000), #barren = grassland bc I checked on the few pixels in my dataset and it was a mis-assignment in the original land cover data (the aerial imagery clearly shows grassy field where barren was assigned)
NLCD1000_Crop = NLCD_CultivatedCrops1000
) %>%
select(Name:Updated_y,NLCD1000_Water:NLCD1000_Crop)
rowSums(SitesLandUse.sum[,c(7:11)]) #OH YAY they all equal one
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
summary(SitesLandUse.sum[,-c(1:6)])
## NLCD1000_Water NLCD1000_Developed NLCD1000_Forest NLCD1000_Herbaceous
## Min. :0.0000000 Min. :0.002023 Min. :0.01677 Min. :0.02255
## 1st Qu.:0.0002889 1st Qu.:0.058071 1st Qu.:0.18507 1st Qu.:0.11276
## Median :0.0018768 Median :0.095361 Median :0.27444 Median :0.25808
## Mean :0.0111786 Mean :0.261119 Mean :0.38044 Mean :0.30326
## 3rd Qu.:0.0060707 3rd Qu.:0.429609 3rd Qu.:0.56432 3rd Qu.:0.45714
## Max. :0.0988154 Max. :0.852227 Max. :0.90546 Max. :0.68091
## NLCD1000_Crop
## Min. :0.0000000
## 1st Qu.:0.0000000
## Median :0.0004338
## Mean :0.0440006
## 3rd Qu.:0.0417973
## Max. :0.3327551
SiteLandUseSummary <- as.data.frame(SitesLandUse.sum[,-c(1:6)]) %>%
pivot_longer(.,cols=starts_with("NLCD"), names_to = "names", values_to = "value") %>%
select(names,value) %>%
separate_wider_delim(., names, delim="_", names = c("scale","class")) %>%
mutate_at("value", as.numeric) %>%
mutate_at("class", as.factor)
ggplot(SiteLandUseSummary, aes(x=class,y=value))+
geom_boxplot()+
ylab("Proportion of Area") +
xlab("Land Cover Type") +
theme_bw() +
theme(
axis.title = element_text(size=18),
axis.text = element_text(size=12))
rm(SiteLandUseSummary)
boxplot(x = as.list(as.data.frame(SitesLandUse.sum[,-c(1:6)])))
# create a summary of pollen community for each barcode-sample at the genus level
# within a sample, multiple ASVs may have assigned to the same taxonomy, so we sum across those records
PollenIDgenera.sum <- PollenIDs %>%
group_by(Barcode, BeeID, Genus) %>% #group by barcode, sample/BeeID, and assigned genus
summarize(PropReads=sum(PropReads)) # sum the proportion of reads
## `summarise()` has grouped output by 'Barcode', 'BeeID'. You can override using
## the `.groups` argument.
# create a list of bees/samples with data from both barcodes
TwoBarcodeBees<-PollenIDgenera.sum %>%
select(BeeID, Barcode) %>%
distinct() %>% group_by(BeeID) %>%
summarise(nBarcode=n()) %>%
filter(nBarcode==2) %>% select(BeeID)
# create a summary of pollen community for each sample at the genus level (average of both barcodes)
PollenIDgenera.comm <- PollenIDs %>%
group_by(BeeID, Genus) %>% # NOT GROUPING BY BARCODE (mean prop reads across barcodes)
summarize(PropReads=sum(PropReads)/first(ifelse(.data$BeeID %in% TwoBarcodeBees$BeeID, 2,1))) %>%
pivot_wider(., names_from = Genus, values_from = PropReads, values_fill = 0)
## `summarise()` has grouped output by 'BeeID'. You can override using the
## `.groups` argument.
# create list of genera appearing in a single sample
singletonGenera<-as.data.frame(
PollenIDs %>%
group_by(Genus) %>%
summarize(n_bees=n_distinct(BeeID)) %>%
filter(n_bees==1) %>%
select(Genus)
)$Genus
dat.Sites <-
left_join(SiteEffort, CapLog.sum, by='SiteID') %>% #join site-effort to capture log summary
left_join(., CapLog.spp, by='SiteID') %>% #join to that the capture log species summary
#install fuzzyjoin & comparator
mutate(across(bimaculatus:pensylvanicus, ~replace_na(.x, 0))) %>%
left_join(., SitesLandUse.sum, by = "Name") %>%
select(-"SiteID.y") %>% #remove redundant column
rename("SiteID"="SiteID.x") %>% #fix column name messed up by joining
mutate(across(CountFlying:pensylvanicus, ~replace_na(.,0))) %>% #replace NAs with 0s
#the current column (" . ")
left_join(.,FlorSpp.sum,by='SiteID') %>% #join floral richness
mutate(FloralRichness=if_else(is.na(FloralRichness)&Name!="Blandy Experimental Farm",0,FloralRichness))%>%
mutate(Bees_60min=((TotalObserved+TotalCaptured)/SearchTime)*60, #calculate bees per 60-min survey
logBees_60min=log(Bees_60min+1)) #calculate log() of that value
dat.Queen <-left_join(CapLog,PollenIDgenera.comm,by="BeeID") %>% #join capture log with pollen summary (both barcodes averaged)
filter(!is.na(Cercis)) #filter rows without metabarcoding data
#rm(PollenIDgenera.comm)
#this dataset has Bombus_spp identity and keeps data from the two barcodes separate
dat<- left_join(
PollenIDgenera.sum %>% pivot_wider(., names_from = "Genus", values_from = "PropReads"),
dat.Queen %>% select(BeeID, Bombus_spp),
by="BeeID") %>%
ungroup()
# table of count of ITS2, rbcL, both samples by Bombus species
dat %>% group_by(BeeID, Bombus_spp, Barcode) %>% summarise(count=n()) %>%
pivot_wider(., names_from = "Barcode", values_from = "count") %>%
mutate(across(ITS2:rbcL, ~replace_na(.x, 0))) %>%
reframe(BeeID=first(BeeID), Bombus_spp=first(Bombus_spp), ITS2=sum(ITS2), rbcL=sum(rbcL), both=ifelse(.data$ITS2 + .data$rbcL==2, "Y", "N")) %>%
group_by(Bombus_spp, both) %>% summarize(totalITS2=sum(ITS2), totalrbcL=sum(rbcL)) %>%
pivot_wider(., names_from = "both", values_from = c(totalITS2, totalrbcL)) %>%
rename("ITS2"="totalITS2_N", "rbcL"="totalrbcL_N", "both"="totalITS2_Y") %>%
select(ITS2, rbcL, both) %>%
mutate(across(ITS2:both, ~replace_na(.x, 0)))
## `summarise()` has grouped output by 'BeeID', 'Bombus_spp'. You can override
## using the `.groups` argument.
## `summarise()` has grouped output by 'Bombus_spp'. You can override using the
## `.groups` argument.
## Adding missing grouping variables: `Bombus_spp`
## # A tibble: 7 × 4
## # Groups: Bombus_spp [7]
## Bombus_spp ITS2 rbcL both
## <fct> <int> <int> <int>
## 1 auricomus 0 1 2
## 2 bimaculatus 6 22 28
## 3 fervidus 0 5 3
## 4 griseocollis 4 4 18
## 5 impatiens 0 0 11
## 6 pensylvanicus 0 0 1
## 7 vagans-sandersoni-perplexus 0 1 1
# how many barcode-samples
dim(PollenIDs %>%
group_by(BeeID, Barcode) %>%
summarize(
n=n()
)) # three columns (BeeID, Barcode, n or number of records), and 171 BeeID-Barcode groups
## `summarise()` has grouped output by 'BeeID'. You can override using the
## `.groups` argument.
## [1] 171 3
# average and sd reads per barcode-sample
PollenIDs %>%
group_by(BeeID, Barcode) %>%
summarise(
sumReads=sum(Reads)
) %>%
group_by(Barcode) %>%
summarise(
mean=mean(sumReads),
sd=stats::sd(sumReads),
min=min(sumReads),
max=max(sumReads)
)
## `summarise()` has grouped output by 'BeeID'. You can override using the
## `.groups` argument.
## # A tibble: 2 × 5
## Barcode mean sd min max
## <chr> <dbl> <dbl> <int> <int>
## 1 ITS2 4579. 3604. 1062 16430
## 2 rbcL 9284. 5380. 1161 22161
# count of pollen samples by month and site
left_join(PollenIDgenera.comm %>% select(BeeID),
dat.Queen %>% select(BeeID, SiteID),
by="BeeID") %>%
left_join(.,
dat.Sites %>% select(SiteID, Name, Date),
by="SiteID") %>%
mutate(SimpName=ifelse(.data$Name=="Blandy Experimental Farm", Name, "All Other"),
Month=lubridate::month(Date)) %>%
group_by(Month, SimpName) %>% summarize(n=n()) %>%
pivot_wider(names_from = Month, values_from = n, values_fill = 0)
## `summarise()` has grouped output by 'Month'. You can override using the
## `.groups` argument.
## # A tibble: 2 × 4
## SimpName `3` `4` `5`
## <chr> <int> <int> <int>
## 1 All Other 2 50 0
## 2 Blandy Experimental Farm 0 36 19
#total reads
PollenIDs %>%
summarise(Reads=sum(Reads))
## Reads
## 1 1239407
#number of reads UNassigned to Species, by barcode
NA_reads<-PollenIDs %>%
group_by(Barcode, Species) %>%
summarise(Reads=sum(Reads)) %>%
filter(is.na(Species))
## `summarise()` has grouped output by 'Barcode'. You can override using the
## `.groups` argument.
#number of reads assigned to Species, by barcode
Assigned_reads<-PollenIDs %>%
group_by(Barcode, Species) %>%
summarise(Reads=sum(Reads)) %>%
filter(!is.na(Species)) %>%
group_by(Barcode) %>%
summarise(Reads=sum(Reads))
## `summarise()` has grouped output by 'Barcode'. You can override using the
## `.groups` argument.
TotalReads<-NA_reads$Reads+Assigned_reads$Reads
#percent of reads *ASSIGNED* to Species, by barcode
(Assigned_reads$Reads / TotalReads) * 100 #ITS2 appears first (alphabetical)
## [1] 37.60540 73.45585
#number of reads UNassigned to Genus, by barcode
NA_reads<-PollenIDs %>%
group_by(Barcode, Genus) %>%
summarise(Reads=sum(Reads)) %>%
filter(is.na(Genus))
## `summarise()` has grouped output by 'Barcode'. You can override using the
## `.groups` argument.
#number of reads assigned to Genus, by barcode
Assigned_reads<-PollenIDs %>%
group_by(Barcode, Genus) %>%
summarise(Reads=sum(Reads)) %>%
filter(!is.na(Genus)) %>%
group_by(Barcode) %>%
summarise(Reads=sum(Reads))
## `summarise()` has grouped output by 'Barcode'. You can override using the
## `.groups` argument.
TotalReads<-NA_reads$Reads+Assigned_reads$Reads
#percent of reads *ASSIGNED* to genus, by barcode
(Assigned_reads$Reads / TotalReads) * 100 #ITS2 appears first (alphabetical)
## [1] 99.16266 89.68581
#number of reads UNassigned to Family, by barcode
NA_reads<-PollenIDs %>%
group_by(Barcode, Family) %>%
summarise(Reads=sum(Reads)) %>%
filter(is.na(Family))
## `summarise()` has grouped output by 'Barcode'. You can override using the
## `.groups` argument.
#number of reads assigned to Family, by barcode
Assigned_reads<-PollenIDs %>%
group_by(Barcode, Family) %>%
summarise(Reads=sum(Reads)) %>%
filter(!is.na(Family)) %>%
group_by(Barcode) %>%
summarise(Reads=sum(Reads))
## `summarise()` has grouped output by 'Barcode'. You can override using the
## `.groups` argument.
TotalReads<-NA_reads$Reads+Assigned_reads$Reads
#percent of reads *ASSIGNED* to Family, by barcode
(Assigned_reads$Reads / TotalReads) * 100 #ITS2 appears first (alphabetical)
## [1] 99.16266 89.68581
rm(NA_reads, Assigned_reads, TotalReads)
#rarefaction of pollen species richness and read depth within each barcode-sample
dat<-PollenIDs %>% # (2 barcodes kept separate) # Genus_species
group_by(Barcode, BeeID, Genus_species) %>%
summarize(Reads=sum(Reads)) %>%
filter(Genus_species!="NA_NA") %>%
pivot_wider(names_from = Genus_species, values_from = Reads, values_fill = 0) %>%
# mutate(rowname=paste(.data$Barcode, .data$BeeID, sep="_")) %>%
ungroup() %>% select(-c(BeeID, Barcode))
## `summarise()` has grouped output by 'Barcode', 'BeeID'. You can override using
## the `.groups` argument.
dat<-as.data.frame(dat)
dat <- dat[rowSums(dat) > 0, ]
vegan::rarecurve(dat, step=20, tidy=T) %>%
ggplot(aes(x=Sample, y=Species, col=Site))+
geom_line()+
labs(x="Read Depth", y="Species detected", col="")+
theme(legend.position = "none") +
guides(color="none")+
lims(x=c(0,5000),y=c(0,25))+
theme_bw(base_size = 18)
## Warning: Removed 24853 rows containing missing values or values outside the scale range
## (`geom_line()`).
#rarefaction of pollen species richness and sample size within each of the most common bombus species
dat<-PollenIDs %>% left_join(., dat.Queen %>% select(BeeID, Bombus_spp, Date), by="BeeID") %>%
filter(Bombus_spp=="bimaculatus"|Bombus_spp=="griseocollis"|Bombus_spp=="impatiens") %>%
# filter(.data$Genus %in% topGenera2$Genus) %>% #optional filter for most common genera
group_by(Bombus_spp, Genus) %>% #first summarize across BeeID within Bombus_spp, pollen Genus
summarize(nSamples=n_distinct(BeeID)) %>%
filter(!is.na(Genus)) %>%
pivot_wider(names_from = Genus, values_from = nSamples, values_fill = 0) %>%
ungroup() %>% select(-c(Bombus_spp))
## `summarise()` has grouped output by 'Bombus_spp'. You can override using the
## `.groups` argument.
dat<-as.data.frame(dat)
dat <- dat[rowSums(dat) > 0, ]
vegan::rarecurve(dat, step=3, tidy=T) %>%
mutate(Site=case_match(Site,
"1"~"bimaculatus",
"2"~"griseocollis",
"3"~"impatiens"))%>%
ggplot(aes(x=Sample, y=Species, col=Site))+
geom_line()+
labs(x="Number of Pollen Samples", y="Genera detected", col="Bombus spp")+
theme_bw(base_size = 18)
rank_abundance = PollenIDs %>%
select(BeeID, Barcode, Family, Genus, PropReads, SampleTotalReads) %>%
arrange(BeeID, Barcode, desc(PropReads)) %>%
group_by(BeeID, Barcode) %>%
mutate(abundance=PropReads,
rank = rank(desc(PropReads), ties.method= "random")) %>%
mutate(rank=factor(rank)) %>%
filter(!is.na(Family) & !is.na(Genus)) %>%
ungroup()
summary(rank_abundance)
## BeeID Barcode Family Genus
## Bg011 : 31 Length:1363 Length:1363 Length:1363
## KLS0201: 26 Class :character Class :character Class :character
## Bg005 : 25 Mode :character Mode :character Mode :character
## Bf001 : 24
## Bb003 : 23
## Bi002 : 23
## (Other):1211
## PropReads SampleTotalReads abundance rank
## Min. :0.0000602 Min. : 1062 Min. :0.0000602 1 :161
## 1st Qu.:0.0031286 1st Qu.: 3836 1st Qu.:0.0031286 2 :150
## Median :0.0136636 Median : 7020 Median :0.0136636 3 :138
## Mean :0.1158430 Mean : 8516 Mean :0.1158430 4 :121
## 3rd Qu.:0.0868255 3rd Qu.:12842 3rd Qu.:0.0868255 5 :106
## Max. :1.0000000 Max. :22161 Max. :1.0000000 6 : 90
## (Other):597
# plot with rank abundance curve
rank_abundance %>%
ggplot(aes(x = rank,
y = abundance)) +
geom_boxplot(aes(fill=Barcode), position = position_dodge(preserve = "single"))+
theme_bw(base_size=18)+
scale_x_discrete(breaks = function(x){x[c(TRUE, FALSE)]})+
facet_wrap(~Barcode) +
labs(x="Abundance Rank", y="Proportion of Reads")
# overall genera richness of samples
PollenIDs %>%
group_by(BeeID)%>%
summarise(GeneraRichness=n_distinct(Genus)) %>%
summarize(mean=mean(GeneraRichness))
## # A tibble: 1 × 1
## mean
## <dbl>
## 1 10.6
# by barcode, genera richness of samples
PollenIDs %>%
group_by(BeeID,Barcode)%>%
summarise(GeneraRichness=n_distinct(Genus)) %>%
group_by(Barcode) %>%
summarize(mean=mean(GeneraRichness), median=stats::median(GeneraRichness), low=min(GeneraRichness), high=max(GeneraRichness))
## `summarise()` has grouped output by 'BeeID'. You can override using the
## `.groups` argument.
## # A tibble: 2 × 5
## Barcode mean median low high
## <chr> <dbl> <dbl> <int> <int>
## 1 ITS2 4.16 4 1 10
## 2 rbcL 9.72 9 1 21
# plot a histogram of barcod-samples' genera richness
PollenIDs %>%
group_by(BeeID,Barcode)%>%
summarise(GeneraRichness=n_distinct(Genus)) %>%
ggplot(aes(x=GeneraRichness)) +
geom_histogram()+
facet_wrap(~Barcode)
## `summarise()` has grouped output by 'BeeID'. You can override using the
## `.groups` argument.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
rm(rank_abundance)
# counting genera in pollen data
PollenIDs %>% distinct(Genus) # 94 total genera detected (plus 1 for NA/unassigned reads)
## Genus
## 1 Prunus
## 2 Lonicera
## 3 Cercis
## 4 <NA>
## 5 Paulownia
## 6 Pinus
## 7 Platanus
## 8 Salix
## 9 Viburnum
## 10 Liquidambar
## 11 Brassica
## 12 Hesperis
## 13 Elaeagnus
## 14 Glechoma
## 15 Anthoxanthum
## 16 Viola
## 17 Mertensia
## 18 Quercus
## 19 Dicentra
## 20 Lupinus
## 21 Morus
## 22 Lamium
## 23 Polemonium
## 24 Chamaecyparis
## 25 Veronica
## 26 Taraxacum
## 27 Cardamine
## 28 Potentilla
## 29 Acer
## 30 Asimina
## 31 Vinca
## 32 Berberis
## 33 Cunninghamia
## 34 Lolium
## 35 Poa
## 36 Buglossoides
## 37 Medicago
## 38 Cynoglossum
## 39 Euscaphis
## 40 Mummenhoffia
## 41 Exochorda
## 42 Akebia
## 43 Magnolia
## 44 Pyrus
## 45 Trifolium
## 46 Ginkgo
## 47 Picea
## 48 Barbarea
## 49 Packera
## 50 Halesia
## 51 Podophyllum
## 52 Fraxinus
## 53 Dactylis
## 54 Robinia
## 55 Wisteria
## 56 Geum
## 57 Thlaspi
## 58 Stylophorum
## 59 Virgilia
## 60 Ranunculus
## 61 Syringa
## 62 Crataegus
## 63 Erigeron
## 64 Leucojum
## 65 Chelidonium
## 66 Photinia
## 67 Liriodendron
## 68 Fontanesia
## 69 Nyssa
## 70 Aesculus
## 71 Rosa
## 72 Gleditsia
## 73 Celastrus
## 74 Vitis
## 75 Securigera
## 76 Pulmonaria
## 77 Lamprocapnos
## 78 Aronia
## 79 Pachysandra
## 80 Gelsemium
## 81 Scilla
## 82 Malus
## 83 Vaccinium
## 84 Fagus
## 85 Vicia
## 86 Toxicodendron
## 87 Impatiens
## 88 Ligustrum
## 89 Gaylussacia
## 90 Cornus
## 91 Alliaria
## 92 Rubus
## 93 Anemone
## 94 Isatis
## 95 Spiraea
## 96 Phleum
#list of all singletonGenera (genera detected in only one sample, by either barcode)
singletonGenera<-as.data.frame(
PollenIDs %>%
group_by(Genus) %>%
summarize(n_bees=n_distinct(BeeID)) %>%
filter(n_bees==1) %>%
select(Genus)
)$Genus
# identify most common pollen by applying filters on genera-level abunance metrics
temp<-PollenIDs %>%
group_by(BeeID, Genus) %>%
summarize(
PropReads=sum(PropReads)/first(ifelse(.data$BeeID %in% TwoBarcodeBees$BeeID, 2,1)),
Barcodes=paste(unique(.data$Barcode), collapse = ", "),
Reads=sum(Reads)) %>%
group_by(Genus) %>%
summarize(Barcodes=Barcodes[which.max(str_length(Barcodes))],
nSamples=n_distinct(BeeID),
meanPropReads=mean(PropReads),
maxPropReads=max(PropReads),
Reads=sum(Reads)) %>%
mutate(Barcodes=recode(Barcodes, 'ITS2, rbcL'="both", "rbcL, ITS2" = "both")) %>%
as.data.frame() %>%
# filter(.data$Genus %in% topGenera1$Genus) %>%
filter(nSamples>5) %>% # filter by number of samples
filter(maxPropReads>0.25) %>% # filter genera with >25% max prop reads w/in a sample
filter(meanPropReads>0.025) %>% #and filter genera with >2.5% mean prop reads
pivot_longer(., cols=c(nSamples:Reads),
names_to = "metric",
values_to = "value") %>%
mutate(metric=factor(metric, levels=c("nSamples", "meanPropReads", "maxPropReads", "Reads"))) %>%
drop_na(Genus)
## `summarise()` has grouped output by 'BeeID'. You can override using the
## `.groups` argument.
temp %>% filter(metric=="Reads") %>% mutate(topGeneraTotalReads=sum(value)) %>% select(topGeneraTotalReads) %>% distinct() # 947562 reads assigned to top genera
## # A tibble: 1 × 1
## topGeneraTotalReads
## <dbl>
## 1 947562
947562/sum(PollenIDs$Reads) #~76% of ALL reads belong to one of the top genera
## [1] 0.7645285
PollenIDs %>% filter(is.na(Genus)) %>% mutate(NAReads=sum(Reads)) %>% select(NAReads) %>% distinct() #95726 unassigned (NA) reads
## NAReads
## 1 95726
947562/(sum(PollenIDs$Reads)-95726) #~83% of assigned reads belong to one of the top genera
## [1] 0.8285195
topGenera2<-temp$Genus %>% as_tibble() %>% distinct() %>% as.data.frame()
colnames(topGenera2)<-"Genus"
temp %>%
ggplot() +
geom_tile(data = temp %>% filter(metric=="nSamples") %>% droplevels,
aes(metric, Genus, fill=value)) +
geom_text(data = temp %>% filter(metric=="nSamples") %>% droplevels,
aes(metric, Genus,
label=format(value, digits=2),
size=4)) +
scale_fill_gradientn(name = "nSamples", colors=c("white","gold")) +
ggnewscale::new_scale_fill() +
geom_tile(data = temp %>% filter(metric=="meanPropReads") %>% droplevels,
aes(metric, Genus, fill=value)) +
geom_text(data = temp %>% filter(metric=="meanPropReads") %>% droplevels,
aes(metric, Genus,
label=format(value, digits=1),
size=4)) +
scale_fill_gradientn(name = "meanPropReads", colors=c("white","gold")) +
ggnewscale::new_scale_fill() +
geom_tile(data = temp %>% filter(metric=="maxPropReads") %>% droplevels,
aes(metric, Genus, fill=value)) +
geom_text(data = temp %>% filter(metric=="maxPropReads") %>% droplevels,
aes(metric, Genus,
label=format(value, digits=2),
size=4)) +
scale_fill_gradientn(name = "maxPropReads", colors=c("white","gold")) +
scale_y_discrete(limits=rev,
labels=paste(sort(unique(temp$Genus), decreasing = T),
paste0("(", (temp %>%
group_by(Genus) %>%
summarize(Barcodes=first(Barcodes)) %>%
arrange(desc(Genus))
)$Barcodes,
")"), sep=" ")) + # alternate:, sep="\n"
scale_x_discrete(limits = c("nSamples","meanPropReads","maxPropReads"),
labels = c("Number of \n Samples \n (total)",
"Proportion of \n Sample Reads \n (mean)",
"Proportion of \n Sample Reads \n (max)")) +
ylab("Pollen Genus (detected by)") +
theme_bw(base_size = 18)+
theme(legend.position = "none",
axis.title.x = element_text(size=0))
#calculating the min and max values of metrics
PollenIDs %>%
group_by(BeeID, Genus) %>%
summarize(
PropReads=sum(PropReads)/first(ifelse(.data$BeeID %in% TwoBarcodeBees$BeeID, 2,1)),
Barcodes=paste(unique(.data$Barcode), collapse = ", "),
Reads=sum(Reads)) %>%
group_by(Genus) %>%
summarize(Barcodes=Barcodes[which.max(str_length(Barcodes))],
nSamples=n_distinct(BeeID),
Reads=sum(Reads),
meanPropReads=round(mean(PropReads),digits=2),
minPropReads=round(min(PropReads),digits=2),
maxPropReads=round(max(PropReads),digits=2),
sdPropReads=round(stats::sd(PropReads),digits=2)) %>%
mutate(Barcodes=recode(Barcodes, 'ITS2, rbcL'="both", "rbcL, ITS2" = "both")) %>%
as.data.frame() %>%
filter(nSamples>5) %>% # filter by number of samples
filter(maxPropReads>0.25) %>% # filter genera with >25% max prop reads w/in a sample
filter(meanPropReads>0.025) #and filter genera with >2.5% mean prop reads
## `summarise()` has grouped output by 'BeeID'. You can override using the
## `.groups` argument.
## Genus Barcodes nSamples Reads meanPropReads minPropReads
## 1 Acer both 36 8197 0.03 0.00
## 2 Aesculus both 6 32238 0.40 0.04
## 3 Barbarea rbcL 7 13453 0.16 0.00
## 4 Cercis both 44 174003 0.26 0.00
## 5 Elaeagnus rbcL 31 20845 0.05 0.00
## 6 Glechoma rbcL 29 40676 0.11 0.00
## 7 Halesia ITS2 12 26793 0.31 0.00
## 8 Lamium both 46 135135 0.25 0.00
## 9 Liriodendron rbcL 19 19506 0.10 0.00
## 10 Lonicera rbcL 28 64660 0.20 0.00
## 11 Mertensia ITS2 25 19491 0.14 0.00
## 12 Mummenhoffia ITS2 13 5597 0.07 0.00
## 13 Paulownia both 10 13567 0.11 0.00
## 14 Pinus rbcL 85 57281 0.06 0.00
## 15 Prunus both 37 81502 0.21 0.00
## 16 Quercus both 30 9702 0.05 0.00
## 17 Salix rbcL 41 107905 0.19 0.00
## 18 Taraxacum ITS2 19 6118 0.07 0.00
## 19 Vaccinium both 9 19670 0.14 0.00
## 20 Viburnum both 29 10381 0.04 0.00
## 21 Viola both 26 80842 0.14 0.00
## 22 <NA> both 92 95726 0.09 0.00
## maxPropReads sdPropReads
## 1 0.65 0.11
## 2 0.77 0.29
## 3 0.64 0.24
## 4 0.98 0.37
## 5 0.28 0.08
## 6 0.48 0.14
## 7 1.00 0.40
## 8 0.98 0.32
## 9 0.99 0.25
## 10 0.96 0.29
## 11 0.47 0.18
## 12 0.36 0.10
## 13 0.50 0.18
## 14 0.43 0.09
## 15 1.00 0.35
## 16 0.27 0.07
## 17 0.99 0.31
## 18 0.31 0.10
## 19 0.68 0.24
## 20 0.38 0.10
## 21 0.84 0.26
## 22 0.70 0.14
rm(temp)
# colors based on this palette
# colorBlindness::SteppedSequential5Steps
myColors.FamGen <- c("#990F0F", "#B22C2C" , "#CC5151" , "#E57E7E" , "#FFB2B2" , "#99540F" , "#B26F2C" , "#CC8E51" , "#E5B17E" , "#FFD8B2" , "#6B990F" , "#85B22C" , "#A3CC51" , "#C3E57E" , "#E5FFB2" , "#0F6B99" , "#2C85B2" , "#51A3CC" , "#7EC3E5" , "#B2E5FF" , "#260F99" , "#8F7EE5", "black")
names(myColors.FamGen) <-
sort(c(levels(
factor(c((PollenIDs %>% filter(.data$Genus %in% topGenera2$Genus) %>% mutate(FamGen=paste(Family, Genus, sep=" ")) %>% arrange(Family) %>% select(FamGen))$FamGen,"z_AllOther"),
as.character(unique(c((PollenIDs %>% filter(.data$Genus %in% topGenera2$Genus) %>% mutate(FamGen=paste(Family, Genus, sep=" ")) %>% arrange(Family) %>% select(FamGen))$FamGen,"z_AllOther"))))
), "z_Shared"))
myColors.topGen <- c("#990F0F", "#B22C2C" , "#CC5151" , "#E57E7E" , "#FFB2B2" , "#99540F" , "#B26F2C" , "#CC8E51" , "#E5B17E" , "#FFD8B2" , "#6B990F" , "#85B22C" , "#A3CC51" , "#C3E57E" , "#E5FFB2" , "#0F6B99" , "#2C85B2" , "#51A3CC" , "#7EC3E5" , "#B2E5FF" , "#260F99" , "#8F7EE5" , "#BFB2FF")
names(myColors.topGen) <-
levels(
factor(c((PollenIDs %>% filter(.data$Genus %in% topGenera2$Genus) %>% mutate(FamGen=paste(Family, Genus, sep=" ")) %>% arrange(Family) %>% select(Genus))$Genus, "z_AllOther"),
as.character(unique(c((PollenIDs %>% filter(.data$Genus %in% topGenera2$Genus) %>% mutate(FamGen=paste(Family, Genus, sep=" ")) %>% arrange(Family) %>% select(Genus))$Genus, "z_AllOther"))))
)
#https://stackoverflow.com/questions/25098107/how-to-order-the-levels-of-factors-according-to-the-ordering-of-a-data-frame-an
head(PollenIDs %>%
filter(Genus %in% topGenera2$Genus) %>%
filter(!is.na(Species)) %>%
group_by(Genus, Barcode) %>%
summarise(
first=first(Species),
n_spp=n_distinct(Species)))
## `summarise()` has grouped output by 'Genus'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 4
## # Groups: Genus [4]
## Genus Barcode first n_spp
## <chr> <chr> <chr> <int>
## 1 Acer ITS2 negundo 4
## 2 Acer rbcL negundo 3
## 3 Aesculus ITS2 x marilandica 2
## 4 Aesculus rbcL hippocastanum 1
## 5 Barbarea rbcL vulgaris 1
## 6 Cercis rbcL siliquastrum 1
PollenIDs %>%
filter(Genus %in% topGenera2$Genus) %>%
filter(!is.na(Species)) %>%
group_by(Genus, Barcode) %>%
summarise(
first=first(Species),
n_spp=n_distinct(Species)) %>%
group_by(Barcode) %>%
summarize(mean=mean(n_spp), min=min(n_spp), max=max(n_spp))
## `summarise()` has grouped output by 'Genus'. You can override using the
## `.groups` argument.
## # A tibble: 2 × 4
## Barcode mean min max
## <chr> <dbl> <int> <int>
## 1 ITS2 2 1 4
## 2 rbcL 1.6 1 4
#floral species common at sites
head(left_join(FlorSpp, dat.Sites %>% select(SiteID, Name, Date), by="SiteID"))
## SiteID Family Genus Species Common Name
## 1 WCP1 Rosaceae Rubus spp Bramble Walnut Creek Park
## 2 WCP1 Fabaceae Cercis canadensis Eastern Redbud Walnut Creek Park
## 3 WCP1 Lauraceae Lindera benzoin Spicebush Walnut Creek Park
## 4 WCP1 Magnoliaceae Liriodendron tulipifera Tulip Poplar Walnut Creek Park
## 5 WCP1 Sapindaceae Acer spp Maple Walnut Creek Park
## 6 WCP1 Violaceae Viola spp Violet Walnut Creek Park
## Date
## 1 2022-03-21
## 2 2022-03-21
## 3 2022-03-21
## 4 2022-03-21
## 5 2022-03-21
## 6 2022-03-21
head(FlorSpp %>%
group_by(Genus, SiteID) %>%
summarise(n=n()) %>%
group_by(Genus) %>%
summarise(n=n(), prop=n/50) %>% #divide by 50 site visits (25 sites visited twice)
arrange(-n))
## `summarise()` has grouped output by 'Genus'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 3
## Genus n prop
## <chr> <int> <dbl>
## 1 Lamium 35 0.7
## 2 Viola 35 0.7
## 3 Taraxacum 32 0.64
## 4 Glechoma 28 0.56
## 5 Barbarea 19 0.38
## 6 Elaeagnus 19 0.38
# floral species richness over time (mean species/site per week)
left_join(FlorSpp, dat.Sites %>% select(SiteID, Name, Date), by="SiteID") %>%
mutate(Genus_spp=paste(Genus, Species, sep="_")) %>%
group_by(SiteID) %>%
summarise(
nGenus_spp=n_distinct(Genus_spp),
) %>% ungroup() %>%
left_join(., dat.Sites %>% select(SiteID, Name, Date), by="SiteID") %>%
group_by(Week=lubridate::week(.data$Date)) %>%
summarise(weekOf=first(Date),
meannGenus_spp=mean(nGenus_spp),
minnGenus_spp=min(nGenus_spp),
maxnGenus_spp=max(nGenus_spp),
n=n()
)
## # A tibble: 7 × 6
## Week weekOf meannGenus_spp minnGenus_spp maxnGenus_spp n
## <dbl> <dttm> <dbl> <int> <int> <int>
## 1 12 2022-03-22 00:00:00 7.22 5 10 9
## 2 13 2022-03-30 00:00:00 7.17 5 11 6
## 3 14 2022-04-04 00:00:00 8.75 3 13 8
## 4 15 2022-04-12 00:00:00 9.38 5 14 8
## 5 16 2022-04-22 00:00:00 8.75 5 13 8
## 6 17 2022-04-29 00:00:00 11.5 5 14 8
## 7 18 2022-04-30 00:00:00 12.5 8 17 2
# Among all *foraging queens observed*, what was the most common host plant?
left_join(CapLog,PollenIDgenera.comm,by="BeeID") %>% # alt version of dat.Queen retaining bees without pollen data (but a few of these queens carried pollen, but we don't have data on it)
filter(!is.na(HostGenus)) %>%
group_by(HostGenus) %>%
summarize(bees=n_distinct(BeeID), propbees=n_distinct(BeeID)/414) %>% #414 total foraging bees
arrange(-bees) %>%
mutate(cumsum=cumsum(propbees))
## # A tibble: 33 × 4
## HostGenus bees propbees cumsum
## <chr> <int> <dbl> <dbl>
## 1 "" 169 0.408 0.408
## 2 "Lamium" 132 0.319 0.727
## 3 "Elaeagnus" 107 0.258 0.986
## 4 "Glechoma" 61 0.147 1.13
## 5 "Prunus" 24 0.0580 1.19
## 6 "Mertensia" 18 0.0435 1.23
## 7 "Lonicera" 12 0.0290 1.26
## 8 "Cercis" 8 0.0193 1.28
## 9 "Barbarea" 6 0.0145 1.30
## 10 "Vinca" 6 0.0145 1.31
## # ℹ 23 more rows
# Among all *queens with pollen data*, what was the most common host plant?
dat.Queen %>%
filter(!is.na(HostGenus)) %>%
group_by(HostGenus) %>%
summarize(bees=n_distinct(BeeID), propbees=n_distinct(BeeID)/103) %>% #103 total foraging bees with pollen data
arrange(-bees) %>%
mutate(cumsum=cumsum(propbees))
## # A tibble: 23 × 4
## HostGenus bees propbees cumsum
## <chr> <int> <dbl> <dbl>
## 1 "Elaeagnus" 30 0.291 0.291
## 2 "Glechoma" 19 0.184 0.476
## 3 "Lamium" 15 0.146 0.621
## 4 "Prunus" 8 0.0777 0.699
## 5 "Barbarea" 5 0.0485 0.748
## 6 "" 4 0.0388 0.786
## 7 "Mertensia" 4 0.0388 0.825
## 8 "Pyracantha" 4 0.0388 0.864
## 9 "Malus" 3 0.0291 0.893
## 10 "Cercis" 2 0.0194 0.913
## # ℹ 13 more rows
dat<-PollenIDs %>% left_join(., dat.Queen %>% select(BeeID, SiteID, Bombus_spp, Date), by="BeeID") %>%
filter(str_starts(SiteID, "B")) %>% #ONLY BLANDY BEES
mutate(FamGen=paste(Family, Genus, sep=" ") # new varible to invoke in plotting
) %>%
group_by(BeeID, Family, Genus, FamGen) %>%
summarise(
Date=first(Date),
Bombus_spp=first(Bombus_spp),
PropReads=sum(PropReads)/first(ifelse(.data$BeeID %in% TwoBarcodeBees$BeeID, 2,1)), # calculate mean prop of reads for each genus for each bee
) %>% ungroup() %>%
mutate(Week=lubridate::week(Date)) %>%
group_by(Week, Family, Genus, FamGen) %>%
summarise(weekly_value=sum(PropReads)) %>% ungroup()
## `summarise()` has grouped output by 'BeeID', 'Family', 'Genus'. You can
## override using the `.groups` argument.
## `summarise()` has grouped output by 'Week', 'Family', 'Genus'. You can override
## using the `.groups` argument.
sort(unique(dat$FamGen))
## [1] "Adoxaceae Viburnum" "Altingiaceae Liquidambar"
## [3] "Amaryllidaceae Leucojum" "Annonaceae Asimina"
## [5] "Apocynaceae Vinca" "Asteraceae Erigeron"
## [7] "Asteraceae Packera" "Asteraceae Taraxacum"
## [9] "Berberidaceae Berberis" "Berberidaceae Podophyllum"
## [11] "Boraginaceae Buglossoides" "Boraginaceae Cynoglossum"
## [13] "Boraginaceae Mertensia" "Boraginaceae Pulmonaria"
## [15] "Brassicaceae Barbarea" "Brassicaceae Brassica"
## [17] "Brassicaceae Cardamine" "Brassicaceae Hesperis"
## [19] "Brassicaceae Mummenhoffia" "Brassicaceae Thlaspi"
## [21] "Caprifoliaceae Lonicera" "Celastraceae Celastrus"
## [23] "Cupressaceae Chamaecyparis" "Cupressaceae Cunninghamia"
## [25] "Elaeagnaceae Elaeagnus" "Fabaceae Cercis"
## [27] "Fabaceae Gleditsia" "Fabaceae Lupinus"
## [29] "Fabaceae Medicago" "Fabaceae Robinia"
## [31] "Fabaceae Securigera" "Fabaceae Trifolium"
## [33] "Fabaceae Virgilia" "Fabaceae Wisteria"
## [35] "Fagaceae Quercus" "Ginkgoaceae Ginkgo"
## [37] "Lamiaceae Glechoma" "Lamiaceae Lamium"
## [39] "Lardizabalaceae Akebia" "Magnoliaceae Liriodendron"
## [41] "Magnoliaceae Magnolia" "Moraceae Morus"
## [43] "NA NA" "Nyssaceae Nyssa"
## [45] "Oleaceae Fontanesia" "Oleaceae Fraxinus"
## [47] "Oleaceae Syringa" "Papaveraceae Chelidonium"
## [49] "Papaveraceae Dicentra" "Papaveraceae Lamprocapnos"
## [51] "Papaveraceae Stylophorum" "Paulowniaceae Paulownia"
## [53] "Pinaceae Picea" "Pinaceae Pinus"
## [55] "Plantaginaceae Veronica" "Platanaceae Platanus"
## [57] "Poaceae Anthoxanthum" "Poaceae Dactylis"
## [59] "Poaceae Lolium" "Poaceae Poa"
## [61] "Polemoniaceae Polemonium" "Ranunculaceae Ranunculus"
## [63] "Rosaceae Crataegus" "Rosaceae Exochorda"
## [65] "Rosaceae Geum" "Rosaceae Photinia"
## [67] "Rosaceae Potentilla" "Rosaceae Prunus"
## [69] "Rosaceae Pyrus" "Rosaceae Rosa"
## [71] "Salicaceae Salix" "Sapindaceae Acer"
## [73] "Sapindaceae Aesculus" "Staphyleaceae Euscaphis"
## [75] "Styracaceae Halesia" "Violaceae Viola"
## [77] "Vitaceae Vitis"
n_distinct(dat$FamGen) #plus NA
## [1] 77
sort(unique(dat$Family))
## [1] "Adoxaceae" "Altingiaceae" "Amaryllidaceae" "Annonaceae"
## [5] "Apocynaceae" "Asteraceae" "Berberidaceae" "Boraginaceae"
## [9] "Brassicaceae" "Caprifoliaceae" "Celastraceae" "Cupressaceae"
## [13] "Elaeagnaceae" "Fabaceae" "Fagaceae" "Ginkgoaceae"
## [17] "Lamiaceae" "Lardizabalaceae" "Magnoliaceae" "Moraceae"
## [21] "Nyssaceae" "Oleaceae" "Papaveraceae" "Paulowniaceae"
## [25] "Pinaceae" "Plantaginaceae" "Platanaceae" "Poaceae"
## [29] "Polemoniaceae" "Ranunculaceae" "Rosaceae" "Salicaceae"
## [33] "Sapindaceae" "Staphyleaceae" "Styracaceae" "Violaceae"
## [37] "Vitaceae"
n_distinct(dat$Family) #plus NA
## [1] 38
#RColorBrewer::display.brewer.all(colorblindFriendly=TRUE)
# ,
dat %>%
ggplot()+
geom_col(aes(x=Week, y=weekly_value,
fill=ifelse(.data$Genus %in% topGenera2$Genus, FamGen,
ifelse(!is.na(.data$Genus), "z_AllOther", NA))),
position = "fill")+
scale_fill_manual(name="FamGen", values = myColors.FamGen, na.value = "gray")+
geom_label(data=PollenIDs %>% left_join(., dat.Queen %>% select(BeeID, SiteID, Date), by="BeeID") %>%
filter(str_starts(SiteID, "B")) %>%
mutate(Week=lubridate::week(Date)) %>%
group_by(Week) %>% summarize(nbees=n_distinct(BeeID)),
aes(x=Week, y=-0.05, label=paste("n = ", nbees, sep="")))+
guides(fill=guide_legend(title = "Family Genus")) +
scale_x_continuous(breaks=c(unique(dat$Week)))+
ylab("Proportion of Reads")+
theme_bw(base_size = 18)+
theme(legend.position = "bottom")
hist((PollenIDs %>% left_join(., dat.Queen %>% select(BeeID, SiteID, Date), by="BeeID") %>% distinct())$Date, breaks=30)
## Warning in breaks[-1L] + breaks[-nB]: NAs produced by integer overflow
# venn diagram: floral surveys intersected with most common genera
site_flowers <- FlorSpp %>% select(Genus) %>% filter(!str_ends(Genus,"NA")) %>% distinct()
pollen_flowers <- PollenIDs %>% drop_na(Genus) %>% select(Family, Genus, Species) %>% distinct(Genus)
gplots::venn(list(site_flowers$Genus, pollen_flowers$Genus), intersections = TRUE)
attr(gplots::venn(list(site_flowers$Genus, pollen_flowers$Genus), intersections = TRUE), "intersections")
## $A
## [1] "Lindera" "Narcissus" "Forsythia" "Phlox" "Corydalis"
## [6] "Claytonia" "Ficaria" "Liriope" "Ilex" "Pryus"
## [11] "Exochardo" "Lobularia" "Hyancinthus" "Sympiocarpus" "Helleborus"
## [16] "Geranium" "Antennaria" "Muscari" "Houstonia" "Erodium"
## [21] "Thalictrum" "Pieris" "Buddleja" "Oxalis" "Stellaria"
## [26] "Ajuga" "Sassafras" "Obolaria" "Fragaria" "Rhododendron"
## [31] "Adonis" "Ornithogalum" "Papaver" "Zizia" "Aquilegia"
## [36] "Gernium" "Arisaema"
##
## $B
## [1] "Paulownia" "Pinus" "Platanus" "Liquidambar"
## [5] "Anthoxanthum" "Quercus" "Dicentra" "Lupinus"
## [9] "Morus" "Polemonium" "Chamaecyparis" "Berberis"
## [13] "Cunninghamia" "Lolium" "Poa" "Medicago"
## [17] "Cynoglossum" "Euscaphis" "Mummenhoffia" "Exochorda"
## [21] "Akebia" "Ginkgo" "Picea" "Halesia"
## [25] "Fraxinus" "Dactylis" "Robinia" "Wisteria"
## [29] "Geum" "Stylophorum" "Virgilia" "Syringa"
## [33] "Leucojum" "Chelidonium" "Photinia" "Fontanesia"
## [37] "Nyssa" "Aesculus" "Rosa" "Gleditsia"
## [41] "Celastrus" "Vitis" "Securigera" "Pulmonaria"
## [45] "Lamprocapnos" "Pachysandra" "Gelsemium" "Scilla"
## [49] "Fagus" "Toxicodendron" "Impatiens" "Ligustrum"
## [53] "Gaylussacia" "Anemone" "Isatis" "Phleum"
##
## $`A:B`
## [1] "Rubus" "Cercis" "Liriodendron" "Acer" "Viola"
## [6] "Pyrus" "Prunus" "Malus" "Lamium" "Taraxacum"
## [11] "Mertensia" "Lonicera" "Glechoma" "Cornus" "Vinca"
## [16] "Veronica" "Elaeagnus" "Cardamine" "Buglossoides" "Ranunculus"
## [21] "Brassica" "Thlaspi" "Barbarea" "Trifolium" "Magnolia"
## [26] "Packera" "Podophyllum" "Viburnum" "Potentilla" "Vaccinium"
## [31] "Alliaria" "Erigeron" "Vicia" "Aronia" "Salix"
## [36] "Crataegus" "Spiraea" "Hesperis" "Asimina"
#two interesting common genera not detected on floral surveys: Halesia and Paulownia (i recently dug up a video of a queen on Halesia lol) but I never noticed a princess tree in bloom.
#the other 4 common genera not detected on floral surveys were trees: Chamaecyparis*, Pinus*, Platanus*, Quercus
# * meanPropReads < 0.05
rm(pollen_flowers, site_flowers)
# how does blooming plant occurrence vary over the season
FlorSpp %>% left_join(., dat.Sites %>%select(SiteID, Date), by="SiteID") %>%
mutate(Week=lubridate::week(Date)) %>%
group_by(Week) %>%
mutate(nsites=n_distinct(SiteID)) %>%
group_by(Week, Genus) %>%
summarize(weekly_value=n_distinct(SiteID), nsites=first(nsites)) %>%
mutate(weekly_value=weekly_value/nsites) %>%
filter(.data$Genus %in% topGenera2$Genus & Week>13) %>%
ggplot()+
geom_line(aes(x=Week, y=weekly_value, col=Genus), linewidth=1, position="jitter") +
scale_color_manual(name="Genus", values = myColors.topGen, na.value = "gray")+
ylim(0,1)
## `summarise()` has grouped output by 'Week'. You can override using the
## `.groups` argument.
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_line()`).
#Cercis peaks in week 15, but remains blooming at about 1/2 the sites for the entire season
#Lamium, Viola is common for the entire season
#Elaeagnus generally increased over time
#Barbarea, Cornus increased over time
#Viburnum appears at the end
#Taraxacum decreased over time
# plot each flower genus occurrence separately
plot<-FlorSpp %>% left_join(., dat.Sites %>% select(SiteID, Date), by="SiteID") %>%
mutate(Week=lubridate::week(Date)) %>%
group_by(Week) %>%
mutate(nsites=n_distinct(SiteID)) %>%
group_by(Week, Genus) %>%
summarize(weekly_value=n_distinct(SiteID), nsites=first(nsites)) %>%
mutate(weekly_value=weekly_value/nsites) %>%
filter(Week>13) %>%
ggplot()+
geom_line(aes(x=Week, y=weekly_value), linewidth=1) +
geom_point(aes(x=Week, y=weekly_value), size=1) +
ylim(0,1)+
facet_wrap(~Genus)
## `summarise()` has grouped output by 'Week'. You can override using the
## `.groups` argument.
suppressMessages(print(plot))
# grab reference databases (sequences with taxonomic info)
rbcL.ref.tax<-"/Users/kelseyschoenemann/Desktop/Bioinformatics/ReferenceDatabases/rbcL-KarenBell_2021.07.08/rbcL_plus_Nov2019adds_Jul2021corrections.dada2.fa"
PLANiTS.sntx.kls <- "/Users/kelseyschoenemann/Desktop/Bioinformatics/ReferenceDatabases/ITS-PLANiTS_2020.03.29/ITS2.SINTAX_format-kls2.fas"
# extract just Family and Genus names from rbcL and ITS2 databases
ref_rbcL <- Biostrings::fasta.index(rbcL.ref.tax)
ref_rbcL_names <- as.data.frame(ref_rbcL$desc); colnames(ref_rbcL_names) <- "names"
ref_rbcL_split_names <- ref_rbcL_names %>%
separate_wider_delim(names, ";", names = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species","null")) %>%
select(-null) %>%
mutate(Family=str_extract(Family, "(?<=f__)[a-zA-Z\\s]+(?=_)"),
Genus=str_extract(Genus, "(?<=g__)[a-zA-Z\\s]+(?=_)"),
Species=str_extract(Species, "(?<=\\s)[a-zA-Z\\s]+(?=_)"),
Barcode="rbcL") %>%
mutate(Family = # correcting out-dated taxonomic names inherited from rbcL reference database (affecting rbcL samples only)
ifelse(.data$Family=="Aceraceae", "Sapindaceae", #change Acer../Hipp.. to Sapi..
ifelse(.data$Family=="Hippocastanaceae", "Sapindaceae",
ifelse(.data$Genus=="Nyssa", "Nyssaceae", #change Corn.. to Nyss (I only have one species from Cornaceae in my data, Nyssa sylvatica, which is in Nyssaceae family) link: https://doi.org/10.1111/boj.12385
.data$Family)))) %>%
mutate(FamGen=paste(Family,Genus,sep = "_")) %>%
group_by(Barcode, Family, FamGen, Genus) %>%
summarise(n=n_distinct(Species)) %>%
filter(!str_ends(FamGen,"_NA")) %>%
filter(!str_ends(FamGen,"_undef"))
## `summarise()` has grouped output by 'Barcode', 'Family', 'FamGen'. You can
## override using the `.groups` argument.
rm(ref_rbcL, ref_rbcL_names)
ref_its2<-Biostrings::fasta.index(PLANiTS.sntx.kls)
ref_its2_names<-as.data.frame(ref_its2$desc); colnames(ref_its2_names) <- "names"
ref_its2_split_names <- ref_its2_names %>%
separate_wider_delim(names, ";", names = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species","null"), too_many = "merge") %>%
select(-null) %>%
mutate(Family=str_extract(Family, "(?<=f_)[a-zA-Z\\s]+"),
Genus=str_extract(Genus, "(?<=g_)[a-zA-Z\\s]+"),
Species=str_extract(Species, "(?<=\\s)[a-zA-Z\\s]+"),
FamGen=paste(Family,Genus,sep = "_"),
Barcode="ITS2") %>%
group_by(Barcode, Family, FamGen, Genus) %>%
summarise(n=n_distinct(Species)) %>%
filter(!str_ends(FamGen,"_NA"))
## `summarise()` has grouped output by 'Barcode', 'Family', 'FamGen'. You can
## override using the `.groups` argument.
rm(ref_its2, ref_its2_names)
# NOTE: ITS2 db has several Genera listed in 2 Families:
# ref_its2_names %>%
# separate_wider_delim(names, ";", names = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species","null"), too_many = "merge") %>%
# select(-null) %>%
# mutate(Family=str_extract(Family, "(?<=f_)[a-zA-Z\\s]+"),
# Genus=str_extract(Genus, "(?<=g_)[a-zA-Z\\s]+"),
# Species=str_extract(Species, "(?<=\\s)[a-zA-Z\\s]+"),
# FamGen=paste(Family,Genus,sep = "_"),
# Barcode="ITS2") %>%
# group_by(Genus) %>% summarize(n_Family=n_distinct(Family)) %>% filter(n_Family!=1)
# NOTE CONT: but none of these Genera appear in my data
# (ref_its2_names %>%
# separate_wider_delim(names, ";", names = c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species","null"), too_many = "merge") %>%
# select(-null) %>%
# mutate(Family=str_extract(Family, "(?<=f_)[a-zA-Z\\s]+"),
# Genus=str_extract(Genus, "(?<=g_)[a-zA-Z\\s]+"),
# Species=str_extract(Species, "(?<=\\s)[a-zA-Z\\s]+"),
# FamGen=paste(Family,Genus,sep = "_"),
# Barcode="ITS2") %>%
# group_by(Genus) %>% summarize(n_Family=n_distinct(Family)) %>% filter(n_Family!=1))$Genus %in% PollenIDs$Genus
# create list of Family-Genera detected in rbcL and ITS2 samples separately
rbcL_assigns<-PollenIDs %>%
group_by(Barcode) %>%
mutate(FamGen=paste0(Family,"_",Genus)) %>%
filter(!str_ends(FamGen,"_NA")) %>%
filter(Barcode=="rbcL") %>%
ungroup() %>%
select(Family, FamGen,Genus) %>%
distinct()
ITS2_assigns<-PollenIDs %>%
group_by(Barcode) %>%
mutate(FamGen=paste0(Family,"_",Genus)) %>%
filter(!str_ends(FamGen,"_NA")) %>%
filter(Barcode=="ITS2") %>%
ungroup() %>%
select(Family, FamGen,Genus) %>%
distinct()
ref_assigns<-PollenIDs %>%
mutate(FamGen=paste0(Family,"_",Genus)) %>%
filter(!str_ends(FamGen,"_NA")) %>%
select(Family, FamGen,Genus) %>%
distinct()
knitr::kable(PollenIDs$Family %>% as_tibble() %>% distinct()) # a list of all Families detected in my data
| value |
|---|
| Rosaceae |
| Caprifoliaceae |
| Fabaceae |
| NA |
| Paulowniaceae |
| Pinaceae |
| Platanaceae |
| Salicaceae |
| Adoxaceae |
| Altingiaceae |
| Brassicaceae |
| Elaeagnaceae |
| Lamiaceae |
| Poaceae |
| Violaceae |
| Boraginaceae |
| Fagaceae |
| Papaveraceae |
| Moraceae |
| Polemoniaceae |
| Cupressaceae |
| Plantaginaceae |
| Asteraceae |
| Sapindaceae |
| Annonaceae |
| Apocynaceae |
| Berberidaceae |
| Staphyleaceae |
| Lardizabalaceae |
| Magnoliaceae |
| Ginkgoaceae |
| Styracaceae |
| Oleaceae |
| Ranunculaceae |
| Amaryllidaceae |
| Nyssaceae |
| Celastraceae |
| Vitaceae |
| Buxaceae |
| Gelsemiaceae |
| Hyacinthaceae |
| Ericaceae |
| Anacardiaceae |
| Balsaminaceae |
| Cornaceae |
# venn diagram of Family-Genus entries in the ITS2 and rbcL dataBASES, and detected Family-Genera
venn_refs<-gplots::venn(list(
ref_its2_split_names$FamGen, # A
ref_rbcL_split_names$FamGen, # B
ref_assigns$FamGen), intersections = TRUE) # C
attr(gplots::venn(list( # similar to above but at the Family level. There are 0 Families detected by just a single dataset
ref_its2_split_names$Family %>% unique(), # A
ref_rbcL_split_names$Family %>% unique(), # B
ref_assigns$Family), intersections = TRUE), "intersections") # C
## $A
## [1] "Acrobolbaceae" "Actinochloridaceae" "Adelanthaceae"
## [4] "Allisoniaceae" "Amblystegiaceae" "Anastrophyllaceae"
## [7] "Aneuraceae" "Anomodontaceae" "Antheliaceae"
## [10] "Aphanochaetaceae" "Asphodelaceae" "Aspleniaceae"
## [13] "Asteromonadaceae" "Astrephomenaceae" "Aulacomniaceae"
## [16] "Balanophoraceae" "Barrancaceae" "Bartramiaceae"
## [19] "Bathycoccaceae" "Blechnaceae" "Blepharostomataceae"
## [22] "Brachytheciaceae" "Bracteacoccaceae" "Brevianthaceae"
## [25] "Bruchiaceae" "Bryaceae" "Bryobartramiaceae"
## [28] "Bryoxiphiaceae" "Callicladiaceae" "Calliergonaceae"
## [31] "Calymperaceae" "Calypogeiaceae" "Catagoniaceae"
## [34] "Caulerpaceae" "Cephaloziaceae" "Cephaloziellaceae"
## [37] "Chaetopeltidaceae" "Chaetophoraceae" "Characeae"
## [40] "Chenopodiaceae" "Chlamydomonadaceae" "Chlorellaceae"
## [43] "Chlorococcaceae" "Chlorodendraceae" "Chloropicaceae"
## [46] "Chromochloridaceae" "Cladophoraceae" "Climaciaceae"
## [49] "Cloniophoraceae" "Coldeniaceae" "Corbichoniaceae"
## [52] "Corsiaceae" "Crustomastigaceae" "Cryphaeaceae"
## [55] "Cyatheaceae" "Cyathodiaceae" "Cynomoriaceae"
## [58] "Daltoniaceae" "Delavayellaceae" "Dendrocerotaceae"
## [61] "Desmidiaceae" "Dicksoniaceae" "Dicranaceae"
## [64] "Dictyochloridaceae" "Dictyococcaceae" "Disceliaceae"
## [67] "Ditrichaceae" "Dolichomastigaceae" "Drummondiaceae"
## [70] "Dryopteridaceae" "Dumortieraceae" "Dunaliellaceae"
## [73] "Echinodiaceae" "Encalyptaceae" "Entodontaceae"
## [76] "Equisetaceae" "Fabroniaceae" "Fissidentaceae"
## [79] "Flatbergiaceae" "Fontinalaceae" "Fossombroniaceae"
## [82] "Francoaceae" "Fritschiellaceae" "Frullaniaceae"
## [85] "Funariaceae" "Gigaspermaceae" "Grimmiaceae"
## [88] "Gymnomitriaceae" "Haematococcaceae" "Halimedaceae"
## [91] "Hedwigiaceae" "Helodiaceae" "Herbertaceae"
## [94] "Hookeriaceae" "Hydrodictyaceae" "Hylocomiaceae"
## [97] "Hymenophytaceae" "Hypnaceae" "Hypnodendraceae"
## [100] "Hypopterygiaceae" "Isoetaceae" "Jamesoniellaceae"
## [103] "Jocheniaceae" "Jubulaceae" "Jungermanniaceae"
## [106] "Klebsormidiaceae" "Koliellaceae" "Kornmanniaceae"
## [109] "Leeaceae" "Lejeuneaceae" "Lembophyllaceae"
## [112] "Lennoaceae" "Lepidolaenaceae" "Lepidoziaceae"
## [115] "Leptodontaceae" "Leptosiraceae" "Lepyrodontaceae"
## [118] "Leskeaceae" "Leucobryaceae" "Leucodontaceae"
## [121] "Leucomiaceae" "Lindsaeaceae" "Lophocoleaceae"
## [124] "Lophoziaceae" "Lycopodiaceae" "Lygodiaceae"
## [127] "Mamiellaceae" "Marchantiaceae" "Marsileaceae"
## [130] "Mastigophoraceae" "Meesiaceae" "Meteoriaceae"
## [133] "Metzgeriaceae" "Microteaceae" "Miyabeaceae"
## [136] "Mniaceae" "Moerckiaceae" "Monocleaceae"
## [139] "Monomastigaceae" "Monostromataceae" "Mychonastaceae"
## [142] "Myliaceae" "Myriniaceae" "Mystropetalaceae"
## [145] "Myuriaceae" "NA" "Namaceae"
## [148] "Neckeraceae" "Neochloridaceae" "Notothyladaceae"
## [151] "Onocleaceae" "Oocystaceae" "Orthodontiaceae"
## [154] "Orthotrichaceae" "Pallaviciniaceae" "Pedinomonadaceae"
## [157] "Pelliaceae" "Petiveriaceae" "Phacotaceae"
## [160] "Phyllogoniaceae" "Pilotrichaceae" "Pithophoraceae"
## [163] "Plagiochilaceae" "Plagiotheciaceae" "Polypodiaceae"
## [166] "Polytrichaceae" "Porellaceae" "Pottiaceae"
## [169] "Prasiolaceae" "Prionodontaceae" "Protosiphonaceae"
## [172] "Pseudocladophoraceae" "Pseudomuriellaceae" "Psilotaceae"
## [175] "Pteridaceae" "Pterigynandraceae" "Pterobryaceae"
## [178] "Ptilidiaceae" "Ptychomniaceae" "Pylaisiaceae"
## [181] "Pylaisiadelphaceae" "Radiococcaceae" "Radulaceae"
## [184] "Regmatodontaceae" "Rhabdoweisiaceae" "Rhacocarpaceae"
## [187] "Rhizogoniaceae" "Rhytidiaceae" "Ricciaceae"
## [190] "Rotundellaceae" "Rutenbergiaceae" "Saelaniaceae"
## [193] "Salviniaceae" "Saulomataceae" "Scapaniaceae"
## [196] "Scenedesmaceae" "Schimperobryaceae" "Schizochlamydaceae"
## [199] "Schizomeridaceae" "Schroederiaceae" "Scouleriaceae"
## [202] "Selaginellaceae" "Selenastraceae" "Seligeriaceae"
## [205] "Sematophyllaceae" "Siphonocladaceae" "Solenostomataceae"
## [208] "Sphaeropleaceae" "Sphagnaceae" "Splachnaceae"
## [211] "Spondylomoraceae" "Stereodontaceae" "Stereophyllaceae"
## [214] "Symphyodontaceae" "Taccaceae" "Takakiaceae"
## [217] "Taxiphyllaceae" "Tetrabaenaceae" "Tetraphidaceae"
## [220] "Theliaceae" "Thismiaceae" "Thuidiaceae"
## [223] "Trachylomataceae" "Trebouxiaceae" "Trentepohliaceae"
## [226] "Tumidellaceae" "Tupiellaceae" "Udoteaceae"
## [229] "Ulotrichaceae" "Ulvaceae" "Ulvellaceae"
## [232] "Uronemataceae" "Volvocaceae" "Zygnemataceae"
##
## $B
## [1] "Achatocarpaceae" "Aextoxicaceae" "Amborellaceae"
## [4] "Anarthriaceae" "Aphloiaceae" "Asteliaceae"
## [7] "Asteropeiaceae" "Atherospermataceae" "Balanopaceae"
## [10] "Barbeuiaceae" "Barbeyaceae" "Bataceae"
## [13] "Berberidopsidaceae" "Biebersteiniaceae" "Blandfordiaceae"
## [16] "Borthwickiaceae" "Brunelliaceae" "Campynemataceae"
## [19] "Cardiopteridaceae" "Centrolepidaceae" "Centroplacaceae"
## [22] "Cephalotaceae" "Columelliaceae" "Ctenolophonaceae"
## [25] "Curtisiaceae" "Cyclanthaceae" "Dasypogonaceae"
## [28] "Degeneriaceae" "Dipentodontaceae" "Dirachmaceae"
## [31] "Doryanthaceae" "Ecdeiocoleaceae" "Emblingiaceae"
## [34] "Escalloniaceae" "Eupomatiaceae" "Geissolomataceae"
## [37] "Gerrardinaceae" "Goupiaceae" "Grubbiaceae"
## [40] "Guamatelaceae" "Haemodoraceae" "Hernandiaceae"
## [43] "Himantandraceae" "Huaceae" "Hydrostachyaceae"
## [46] "Irvingiaceae" "Ixonanthaceae" "Koeberliniaceae"
## [49] "Lacistemataceae" "Lactoridaceae" "Lanariaceae"
## [52] "Lepidobotryaceae" "Lophopyxidaceae" "Macarthuriaceae"
## [55] "Mayacaceae" "Melianthaceae" "Montiniaceae"
## [58] "Myrothamnaceae" "Nanodeaceae" "Octoknemaceae"
## [61] "Olacaceae" "Oncothecaceae" "Pandaceae"
## [64] "Paracryphiaceae" "Pentaphragmataceae" "Peridiscaceae"
## [67] "Petenaeaceae" "Petermanniaceae" "Petrosaviaceae"
## [70] "Phellinaceae" "Philydraceae" "Physenaceae"
## [73] "Plocospermataceae" "Posidoniaceae" "Quillajaceae"
## [76] "Rehmanniaceae" "Restionaceae" "Rhabdodendraceae"
## [79] "Schlegeliaceae" "Setchellanthaceae" "Simmondsiaceae"
## [82] "Sphaerosepalaceae" "Sphenocleaceae" "Stemonaceae"
## [85] "Strasburgeriaceae" "Strombosiaceae" "Surianaceae"
## [88] "Tecophilaeaceae" "Thurniaceae" "Torricelliaceae"
## [91] "Trigoniaceae" "Trimeniaceae" "Vahliaceae"
## [94] "Velloziaceae" "Vivianiaceae" "Welwitschiaceae"
## [97] "Xanthoceraceae" "Xanthorrhoeaceae" "Xeronemataceae"
## [100] "undef"
##
## $`A:B`
## [1] "Acanthaceae" "Achariaceae" "Acoraceae"
## [4] "Actinidiaceae" "Agdestidaceae" "Aizoaceae"
## [7] "Akaniaceae" "Alismataceae" "Alseuosmiaceae"
## [10] "Alstroemeriaceae" "Alzateaceae" "Amaranthaceae"
## [13] "Amphorogynaceae" "Anacampserotaceae" "Ancistrocladaceae"
## [16] "Anisophylleaceae" "Aphanopetalaceae" "Apiaceae"
## [19] "Aponogetonaceae" "Aptandraceae" "Aquifoliaceae"
## [22] "Araceae" "Araliaceae" "Araucariaceae"
## [25] "Arecaceae" "Argophyllaceae" "Aristolochiaceae"
## [28] "Asparagaceae" "Austrobaileyaceae" "Basellaceae"
## [31] "Begoniaceae" "Betulaceae" "Bignoniaceae"
## [34] "Bixaceae" "Bonnetiaceae" "Boryaceae"
## [37] "Bromeliaceae" "Bruniaceae" "Burmanniaceae"
## [40] "Burseraceae" "Butomaceae" "Byblidaceae"
## [43] "Cabombaceae" "Cactaceae" "Calceolariaceae"
## [46] "Calophyllaceae" "Calycanthaceae" "Calyceraceae"
## [49] "Campanulaceae" "Canellaceae" "Cannabaceae"
## [52] "Cannaceae" "Capparaceae" "Caricaceae"
## [55] "Carlemanniaceae" "Caryocaraceae" "Caryophyllaceae"
## [58] "Casuarinaceae" "Ceratophyllaceae" "Cercidiphyllaceae"
## [61] "Cervantesiaceae" "Chloranthaceae" "Chrysobalanaceae"
## [64] "Circaeasteraceae" "Cistaceae" "Cleomaceae"
## [67] "Clethraceae" "Clusiaceae" "Codonaceae"
## [70] "Colchicaceae" "Comandraceae" "Combretaceae"
## [73] "Commelinaceae" "Connaraceae" "Convolvulaceae"
## [76] "Cordiaceae" "Coriariaceae" "Corynocarpaceae"
## [79] "Costaceae" "Coulaceae" "Crassulaceae"
## [82] "Crossosomataceae" "Crypteroniaceae" "Cucurbitaceae"
## [85] "Cunoniaceae" "Cycadaceae" "Cymodoceaceae"
## [88] "Cyperaceae" "Cyrillaceae" "Daphniphyllaceae"
## [91] "Datiscaceae" "Diapensiaceae" "Dichapetalaceae"
## [94] "Didiereaceae" "Dilleniaceae" "Dioncophyllaceae"
## [97] "Dioscoreaceae" "Dipterocarpaceae" "Droseraceae"
## [100] "Drosophyllaceae" "Ebenaceae" "Ehretiaceae"
## [103] "Elaeocarpaceae" "Elatinaceae" "Ephedraceae"
## [106] "Eriocaulaceae" "Erythropalaceae" "Erythroxylaceae"
## [109] "Eucommiaceae" "Euphorbiaceae" "Euphroniaceae"
## [112] "Eupteleaceae" "Flagellariaceae" "Fouquieriaceae"
## [115] "Frankeniaceae" "Garryaceae" "Gentianaceae"
## [118] "Geraniaceae" "Gesneriaceae" "Gisekiaceae"
## [121] "Gnetaceae" "Gomortegaceae" "Goodeniaceae"
## [124] "Griseliniaceae" "Grossulariaceae" "Gunneraceae"
## [127] "Gyrostemonaceae" "Halophytaceae" "Haloragaceae"
## [130] "Hamamelidaceae" "Hanguanaceae" "Heliconiaceae"
## [133] "Heliotropiaceae" "Helwingiaceae" "Humiriaceae"
## [136] "Hydatellaceae" "Hydrangeaceae" "Hydrocharitaceae"
## [139] "Hydroleaceae" "Hydrophyllaceae" "Hypericaceae"
## [142] "Hypoxidaceae" "Icacinaceae" "Iridaceae"
## [145] "Iteaceae" "Ixioliriaceae" "Joinvilleaceae"
## [148] "Juglandaceae" "Juncaceae" "Juncaginaceae"
## [151] "Kewaceae" "Krameriaceae" "Lauraceae"
## [154] "Lecythidaceae" "Lentibulariaceae" "Liliaceae"
## [157] "Limeaceae" "Limnanthaceae" "Linaceae"
## [160] "Linderniaceae" "Loasaceae" "Loganiaceae"
## [163] "Lophiocarpaceae" "Loranthaceae" "Lowiaceae"
## [166] "Lythraceae" "Malpighiaceae" "Malvaceae"
## [169] "Marantaceae" "Marcgraviaceae" "Martyniaceae"
## [172] "Maundiaceae" "Mazaceae" "Melanthiaceae"
## [175] "Melastomataceae" "Meliaceae" "Menispermaceae"
## [178] "Menyanthaceae" "Metteniusaceae" "Misodendraceae"
## [181] "Molluginaceae" "Monimiaceae" "Montiaceae"
## [184] "Moringaceae" "Muntingiaceae" "Musaceae"
## [187] "Myodocarpaceae" "Myricaceae" "Myristicaceae"
## [190] "Myrtaceae" "Nartheciaceae" "Nelumbonaceae"
## [193] "Nepenthaceae" "Neuradaceae" "Nitrariaceae"
## [196] "Nothofagaceae" "Nyctaginaceae" "Nymphaeaceae"
## [199] "Ochnaceae" "Onagraceae" "Opiliaceae"
## [202] "Orchidaceae" "Orobanchaceae" "Oxalidaceae"
## [205] "Paeoniaceae" "Pandanaceae" "Passifloraceae"
## [208] "Pedaliaceae" "Penaeaceae" "Pennantiaceae"
## [211] "Pentaphylacaceae" "Penthoraceae" "Peraceae"
## [214] "Philesiaceae" "Phrymaceae" "Phyllanthaceae"
## [217] "Phyllonomaceae" "Phytolaccaceae" "Picramniaceae"
## [220] "Picrodendraceae" "Piperaceae" "Pittosporaceae"
## [223] "Plumbaginaceae" "Podocarpaceae" "Podostemaceae"
## [226] "Polygalaceae" "Polygonaceae" "Pontederiaceae"
## [229] "Portulacaceae" "Potamogetonaceae" "Primulaceae"
## [232] "Proteaceae" "Putranjivaceae" "Rapateaceae"
## [235] "Resedaceae" "Rhamnaceae" "Rhizophoraceae"
## [238] "Ripogonaceae" "Roridulaceae" "Rousseaceae"
## [241] "Rubiaceae" "Rutaceae" "Sabiaceae"
## [244] "Salvadoraceae" "Santalaceae" "Sapotaceae"
## [247] "Sarcobataceae" "Sarcolaenaceae" "Sarraceniaceae"
## [250] "Saururaceae" "Saxifragaceae" "Scheuchzeriaceae"
## [253] "Schisandraceae" "Schoepfiaceae" "Sciadopityaceae"
## [256] "Scrophulariaceae" "Simaroubaceae" "Siparunaceae"
## [259] "Sladeniaceae" "Smilacaceae" "Solanaceae"
## [262] "Stachyuraceae" "Stegnospermataceae" "Stemonuraceae"
## [265] "Stilbaceae" "Strelitziaceae" "Stylidiaceae"
## [268] "Symplocaceae" "Talinaceae" "Tamaricaceae"
## [271] "Tapisciaceae" "Taxaceae" "Tetracarpaeaceae"
## [274] "Tetrachondraceae" "Tetramelaceae" "Tetrameristaceae"
## [277] "Theaceae" "Thesiaceae" "Thomandersiaceae"
## [280] "Thymelaeaceae" "Ticodendraceae" "Tofieldiaceae"
## [283] "Tovariaceae" "Triuridaceae" "Trochodendraceae"
## [286] "Tropaeolaceae" "Typhaceae" "Ulmaceae"
## [289] "Urticaceae" "Verbenaceae" "Viscaceae"
## [292] "Vochysiaceae" "Wellstediaceae" "Winteraceae"
## [295] "Ximeniaceae" "Xyridaceae" "Zamiaceae"
## [298] "Zingiberaceae" "Zosteraceae" "Zygophyllaceae"
##
## $`A:B:C`
## [1] "Adoxaceae" "Altingiaceae" "Amaryllidaceae" "Anacardiaceae"
## [5] "Annonaceae" "Apocynaceae" "Asteraceae" "Balsaminaceae"
## [9] "Berberidaceae" "Boraginaceae" "Brassicaceae" "Buxaceae"
## [13] "Caprifoliaceae" "Celastraceae" "Cornaceae" "Cupressaceae"
## [17] "Elaeagnaceae" "Ericaceae" "Fabaceae" "Fagaceae"
## [21] "Gelsemiaceae" "Ginkgoaceae" "Hyacinthaceae" "Lamiaceae"
## [25] "Lardizabalaceae" "Magnoliaceae" "Moraceae" "Nyssaceae"
## [29] "Oleaceae" "Papaveraceae" "Paulowniaceae" "Pinaceae"
## [33] "Plantaginaceae" "Platanaceae" "Poaceae" "Polemoniaceae"
## [37] "Ranunculaceae" "Rosaceae" "Salicaceae" "Sapindaceae"
## [41] "Staphyleaceae" "Styracaceae" "Violaceae" "Vitaceae"
attr(venn_refs, "intersections")[["A:C"]] #Genus present in ITS2 db & detected, but not in rbcL db (ONE TAXON)
## [1] "Brassicaceae_Mummenhoffia"
attr(venn_refs, "intersections")[["B:C"]] #Genus present in rbcL db & detected, but not in ITS2 db (NONE)
## NULL
# the two datasets were capable of detecting virtually all the same taxa
# VennDiagram::venn.diagram( #making a *pretty* diagram (same info as three-way venn above)
# list(ref_its2_split_names$FamGen,
# ref_rbcL_split_names$FamGen,
# ref_assigns$FamGen),
# # main="Genera Shared Between Reference Databases",
# # main.pos = c(0.5,0.95),
# # main.fontfamily = "sans",
# # main.fontface = "plain",
# fontfamily = c(rep("sans",7)),
# fontface = c(rep("plain",3), rep("bold",4)),
# cex = c(rep(2,3), rep(3,4)),
# cat.fontface = c("plain","plain","bold"),
# cat.fontfamily=c("sans","sans", "sans"),
# category.names = c("ITS2 reference \n database", "rbcL reference \n database", "detected \n in this project"),
# fill = c("#F8766D", "#00BFC4", "white"),
# alpha = c(0.5, 0.5, 0),
# lwd = c(0, 0, 2),
# cat.pos=c(25, 335,180), # these are ordered: rbcL, ITS, detected (because i flipped the plot 180 below)
# cat.dist=c(0.08,0.08, -0.04),
# rotation.degree=180,
# "Figure_5_left.tiff") #saves to the current working directory
# venn diagram of Genus assignments for the ITS2 and rbcL dataSETS
venn_assigns<-gplots::venn(
list(ITS2_assigns$Genus,
rbcL_assigns$Genus), intersections = TRUE)
attr(venn_assigns, "intersections")[["A"]] #22 genera detected only in ITS2 samples
## [1] "Mertensia" "Taraxacum" "Cardamine" "Buglossoides" "Mummenhoffia"
## [6] "Exochorda" "Packera" "Halesia" "Geum" "Thlaspi"
## [11] "Crataegus" "Erigeron" "Leucojum" "Chelidonium" "Photinia"
## [16] "Fontanesia" "Celastrus" "Malus" "Fagus" "Impatiens"
## [21] "Gaylussacia" "Anemone"
attr(venn_assigns, "intersections")[["A:B"]] #29 genera detected by both
## [1] "Prunus" "Quercus" "Dicentra" "Lamium" "Polemonium"
## [6] "Viola" "Potentilla" "Medicago" "Cercis" "Acer"
## [11] "Robinia" "Platanus" "Podophyllum" "Viburnum" "Ranunculus"
## [16] "Syringa" "Paulownia" "Hesperis" "Aesculus" "Rosa"
## [21] "Pulmonaria" "Lamprocapnos" "Stylophorum" "Aronia" "Pyrus"
## [26] "Vicia" "Berberis" "Vaccinium" "Spiraea"
attr(venn_assigns, "intersections")[["B"]] #43 genera detected only in rbcL samples
## [1] "Lonicera" "Pinus" "Salix" "Liquidambar"
## [5] "Brassica" "Elaeagnus" "Glechoma" "Anthoxanthum"
## [9] "Lupinus" "Morus" "Chamaecyparis" "Veronica"
## [13] "Asimina" "Vinca" "Cunninghamia" "Lolium"
## [17] "Poa" "Cynoglossum" "Euscaphis" "Akebia"
## [21] "Magnolia" "Trifolium" "Ginkgo" "Picea"
## [25] "Barbarea" "Fraxinus" "Dactylis" "Wisteria"
## [29] "Virgilia" "Liriodendron" "Nyssa" "Gleditsia"
## [33] "Vitis" "Securigera" "Pachysandra" "Gelsemium"
## [37] "Scilla" "Toxicodendron" "Ligustrum" "Cornus"
## [41] "Alliaria" "Rubus" "Isatis" "Phleum"
# although the two databases contained the same genera detected in my study, certain detected genera only appeared in one dataset or the other
# venn diagram of Genus assignments for the ITS2 and rbcL dataSETS, and most common genera
venn_assigns_common<-gplots::venn(
list(ITS2_assigns$Genus,
rbcL_assigns$Genus,
topGenera2$Genus), intersections = TRUE)
Genera_ITS2<-attr(venn_assigns_common, "intersections")[["A:C"]] #4 common genera detected only in ITS2 samples
Genera_Both<-attr(venn_assigns_common, "intersections")[["A:B:C"]] #9 common genera detected by both
Genera_rbcL<-attr(venn_assigns_common, "intersections")[["B:C"]] #7 common genera detected only in rbcL samples
# VennDiagram::venn.diagram( #making a *pretty* diagram (same info as three-way venn above)
# list(ITS2_assigns$Genus,
# rbcL_assigns$Genus,
# topGenera2$Genus),
# # main="Genera Shared Between Barcode Datasets",
# # main.pos = c(0.5,0.95),
# # main.fontfamily = "sans",
# # main.fontface = "plain",
# fontfamily = c(rep("sans",7)),
# fontface = c(rep("plain",3), rep("bold",4)),
# cex = c(rep(2,3), rep(3,4)),
# cat.fontface = c("plain","plain","bold"),
# cat.fontfamily=c("sans","sans","sans"),
# category.names = c("genera detected in \n ITS2 samples", "genera detected in \n rbcL samples", "most common genera"),
# fill = c("#F8766D", "#00BFC4", "white"),
# alpha = c(0.5, 0.5, 0),
# lwd = c(0, 0, 2),
# # lty = c(1,1,1),
# # col = c("black","black","white"),
# # cat.cex=c(2.5,2.5,0),
# cat.pos=c(335, 25, 180),
# cat.dist=c(0.08, 0.08, -0.2),
# "Figure_5_right.tiff") #saves to the current working directory
# grid::grid.raster(tiff::readTIFF("file_name.tiff")) # view the plot
# use some lists and functions to return a table with counts of intersections
list<-list(
most_common=topGenera2$Genus,
its2_detect=ITS2_assigns$Genus,
rbcL_detect=rbcL_assigns$Genus
)
combs <- unlist(lapply(1:length(list),
function(j) combn(names(list), j, simplify = FALSE)),
recursive = FALSE)
names(combs) <- sapply(combs, function(i) paste0(i, collapse = ""))
str(combs)
## List of 7
## $ most_common : chr "most_common"
## $ its2_detect : chr "its2_detect"
## $ rbcL_detect : chr "rbcL_detect"
## $ most_commonits2_detect : chr [1:2] "most_common" "its2_detect"
## $ most_commonrbcL_detect : chr [1:2] "most_common" "rbcL_detect"
## $ its2_detectrbcL_detect : chr [1:2] "its2_detect" "rbcL_detect"
## $ most_commonits2_detectrbcL_detect: chr [1:3] "most_common" "its2_detect" "rbcL_detect"
Intersect <- function (x) {
# Multiple set version of intersect
# x is a list
if (length(x) == 1) {
unlist(x)
} else if (length(x) == 2) {
intersect(x[[1]], x[[2]])
} else if (length(x) > 2){
intersect(x[[1]], Intersect(x[-1]))
}
}
Union <- function (x) {
# Multiple set version of union
# x is a list
if (length(x) == 1) {
unlist(x)
} else if (length(x) == 2) {
union(x[[1]], x[[2]])
} else if (length(x) > 2) {
union(x[[1]], Union(x[-1]))
}
}
Setdiff <- function (x, y) {
# Remove the union of the y's from the common x's.
# x and y are lists of characters.
xx <- Intersect(x)
yy <- Union(y)
setdiff(xx, yy)
}
elements <-
lapply(combs, function(i) Setdiff(list[i], list[setdiff(names(list), i)]))
n.elements <- sapply(elements, length)
print(n.elements)
## most_common its2_detect
## 0 18
## rbcL_detect most_commonits2_detect
## 37 4
## most_commonrbcL_detect its2_detectrbcL_detect
## 7 19
## most_commonits2_detectrbcL_detect
## 10
#from lines list<-list() ...to... print(n.elements): this code was largely copied from https://stackoverflow.com/questions/23559371/get-the-list-of-items-in-venn-diagram
rm(list, combs, Intersect, Union, Setdiff, elements, n.elements)
rm(rbcL_assigns, ITS2_assigns, ref_assigns)
#plot prop reads assigned to the top 21 genera by barcode
ggplot()+
geom_col(data=PollenIDs %>% mutate(FamGen=paste(Family, Genus, sep=" ")),
aes(x=Barcode, y=PropReads,
fill = ifelse(.data$Genus %in% topGenera2$Genus, FamGen,
ifelse(!is.na(.data$Genus), "z_AllOther", NA))),
position="fill")+
geom_label(data=PollenIDs %>% group_by(Barcode) %>% summarize(nbees=n_distinct(BeeID)),
aes(x=Barcode, y=-0.05, label=paste("n = ", nbees, sep="")))+
scale_fill_manual(name="FamGen", values = myColors.FamGen, na.value = "gray")+
guides(fill=guide_legend(title = ""))+
labs(y="Proportion of Reads") +
theme_bw(base_size = 16)+
theme(legend.position = "bottom",
legend.text = element_text(size=10),
legend.box.just = "left")
# prop reads assigned to top genera UNIQUE to each barcode
PollenIDs %>%
mutate(FamGen=paste(Family, Genus, sep=" ")) %>%
ggplot(aes(x=Barcode,
y=PropReads,
fill = ifelse(.data$Genus %in% Genera_Both, "z_Shared",
ifelse(.data$Genus %in% topGenera2$Genus, FamGen,
ifelse(!is.na(.data$Genus), "z_AllOther", NA)))))+
geom_col(position="fill")+
scale_fill_manual(name="FamGen", values = myColors.FamGen, na.value = "gray")+
guides(fill=guide_legend(title = "Family Genus"))+
theme_bw()
#to plot side by side the prop reads assigned to Genus for the two barcodes
#creating a vector of sample names that have both rbcL and ITS2 data
both<-as.vector((PollenIDs %>% group_by(BeeID, Barcode) %>% summarise(BeeID=first(BeeID), Barcode=first(Barcode)) %>% group_by(BeeID) %>% summarise(Barcodes=n()) %>%
filter(Barcodes==2))$BeeID) #64 pollen samples with rbcL and ITS2 data
## `summarise()` has grouped output by 'BeeID'. You can override using the
## `.groups` argument.
dim(CapLog %>% select(BeeID, CarryPollen) %>% filter(CarryPollen==1)) # 130 queens carrying pollen
## [1] 130 2
dim(CapLog %>% select(BeeID, SampleID, CarryPollen) %>% filter(CarryPollen==1&SampleID!=0)) # 111 pollen samples collected
## [1] 111 3
dim(PollenIDs %>% select(BeeID) %>% distinct()) # 107 samples with pollen data
## [1] 107 1
#the proportion of reads assigned to genera, split up by each individual pollen sample/bee; the same colors means the two barcodes agree with pollen genera composition
PollenIDs %>%
filter(BeeID %in% both) %>%
ggplot(aes(x=Barcode,y=PropReads,fill=Genus))+
geom_col(position="fill")+
facet_wrap(~BeeID)+
theme(legend.position = "none")
# do the barcode-specific genera appear in samples with only one or the other barcode?
#among samples in which genera unique to ITS2 were detected, how many samples had data from one or both barcodes?
samples_with_ITS2Only_genera<-PollenIDs %>% filter(.data$Genus %in% Genera_ITS2) %>% distinct(BeeID)
PollenIDs %>% filter(.data$BeeID %in% samples_with_ITS2Only_genera$BeeID) %>% select(BeeID,Barcode) %>% group_by(BeeID) %>% summarize(NBarcodes=n_distinct(Barcode)) %>% group_by(NBarcodes) %>% summarize(NSamples=n_distinct(BeeID))
## # A tibble: 2 × 2
## NBarcodes NSamples
## <int> <int>
## 1 1 6
## 2 2 40
samples_with_rbcLOnly_genera<-PollenIDs %>% filter(.data$Genus %in% Genera_rbcL) %>% distinct(BeeID)
PollenIDs %>% filter(.data$BeeID %in% samples_with_rbcLOnly_genera$BeeID) %>% select(BeeID,Barcode) %>% group_by(BeeID) %>% summarize(NBarcodes=n_distinct(Barcode)) %>% group_by(NBarcodes) %>% summarize(NSamples=n_distinct(BeeID))
## # A tibble: 2 × 2
## NBarcodes NSamples
## <int> <int>
## 1 1 31
## 2 2 58
rm(samples_with_ITS2Only_genera, samples_with_rbcLOnly_genera)
# are barcode-specific genera due to a "diff-genera, same-family" type situation?
# taxonomic names in reference databases, for genera unique to each barcode
rbind(ref_its2_split_names, ref_rbcL_split_names)
## # A tibble: 18,159 × 5
## # Groups: Barcode, Family, FamGen [18,159]
## Barcode Family FamGen Genus n
## <chr> <chr> <chr> <chr> <int>
## 1 ITS2 Acanthaceae Acanthaceae_Acanthopale Acanthopale 2
## 2 ITS2 Acanthaceae Acanthaceae_Acanthopsis Acanthopsis 2
## 3 ITS2 Acanthaceae Acanthaceae_Acanthostelma Acanthostelma 1
## 4 ITS2 Acanthaceae Acanthaceae_Acanthus Acanthus 5
## 5 ITS2 Acanthaceae Acanthaceae_Achyrocalyx Achyrocalyx 1
## 6 ITS2 Acanthaceae Acanthaceae_Aechmanthera Aechmanthera 1
## 7 ITS2 Acanthaceae Acanthaceae_Ancistranthus Ancistranthus 1
## 8 ITS2 Acanthaceae Acanthaceae_Andrographis Andrographis 1
## 9 ITS2 Acanthaceae Acanthaceae_Angkalanthus Angkalanthus 1
## 10 ITS2 Acanthaceae Acanthaceae_Anisacanthus Anisacanthus 11
## # ℹ 18,149 more rows
rbind(ref_its2_split_names, ref_rbcL_split_names) %>% filter(.data$Genus %in% Genera_ITS2)
## # A tibble: 7 × 5
## # Groups: Barcode, Family, FamGen [7]
## Barcode Family FamGen Genus n
## <chr> <chr> <chr> <chr> <int>
## 1 ITS2 Asteraceae Asteraceae_Taraxacum Taraxacum 65
## 2 ITS2 Boraginaceae Boraginaceae_Mertensia Mertensia 5
## 3 ITS2 Brassicaceae Brassicaceae_Mummenhoffia Mummenhoffia 1
## 4 ITS2 Styracaceae Styracaceae_Halesia Halesia 2
## 5 rbcL Asteraceae Asteraceae_Taraxacum Taraxacum 9
## 6 rbcL Boraginaceae Boraginaceae_Mertensia Mertensia 1
## 7 rbcL Styracaceae Styracaceae_Halesia Halesia 2
rbind(ref_its2_split_names, ref_rbcL_split_names) %>% filter(.data$Genus %in% Genera_rbcL)
## # A tibble: 14 × 5
## # Groups: Barcode, Family, FamGen [14]
## Barcode Family FamGen Genus n
## <chr> <chr> <chr> <chr> <int>
## 1 ITS2 Brassicaceae Brassicaceae_Barbarea Barbarea 3
## 2 ITS2 Caprifoliaceae Caprifoliaceae_Lonicera Lonicera 52
## 3 ITS2 Elaeagnaceae Elaeagnaceae_Elaeagnus Elaeagnus 8
## 4 ITS2 Lamiaceae Lamiaceae_Glechoma Glechoma 5
## 5 ITS2 Magnoliaceae Magnoliaceae_Liriodendron Liriodendron 1
## 6 ITS2 Pinaceae Pinaceae_Pinus Pinus 40
## 7 ITS2 Salicaceae Salicaceae_Salix Salix 20
## 8 rbcL Brassicaceae Brassicaceae_Barbarea Barbarea 3
## 9 rbcL Caprifoliaceae Caprifoliaceae_Lonicera Lonicera 25
## 10 rbcL Elaeagnaceae Elaeagnaceae_Elaeagnus Elaeagnus 10
## 11 rbcL Lamiaceae Lamiaceae_Glechoma Glechoma 3
## 12 rbcL Magnoliaceae Magnoliaceae_Liriodendron Liriodendron 2
## 13 rbcL Pinaceae Pinaceae_Pinus Pinus 91
## 14 rbcL Salicaceae Salicaceae_Salix Salix 59
# what are the family names of the common genera detected by only one barcode or the other
PollenIDs %>% select(Barcode, Family, Genus) %>% arrange(Barcode, Family) %>% distinct() %>%
filter(if_else(.data$Barcode=="ITS2",
.data$Family %in% (rbind(ref_its2_split_names, ref_rbcL_split_names) %>% filter(.data$Genus %in% Genera_rbcL))$Family,
.data$Family %in% (rbind(ref_its2_split_names, ref_rbcL_split_names) %>% filter(.data$Genus %in% Genera_ITS2))$Family))
## Barcode Family Genus
## 1 ITS2 Brassicaceae Cardamine
## 2 ITS2 Brassicaceae Mummenhoffia
## 3 ITS2 Brassicaceae Thlaspi
## 4 ITS2 Brassicaceae Hesperis
## 5 ITS2 Lamiaceae Lamium
## 6 rbcL Boraginaceae Cynoglossum
## 7 rbcL Boraginaceae Pulmonaria
## 8 rbcL Brassicaceae Brassica
## 9 rbcL Brassicaceae Hesperis
## 10 rbcL Brassicaceae Barbarea
## 11 rbcL Brassicaceae Alliaria
## 12 rbcL Brassicaceae Isatis
# which samples names have mertensia (a its2 unique genus) in them
PollenIDs %>% filter(Genus=="Brassica") %>% filter(.data$BeeID %in% TwoBarcodeBees$BeeID) %>% select(BeeID)
## BeeID
## 1 Bb004
## 2 Bb005
## 3 Bb011
## 4 Bb018
## 5 Bf001
## 6 Bg002
## 7 Bg003
## 8 Bg004
## 9 Bg009
## 10 Bg010
## 11 Bg011
## 12 KLS0163
## 13 KLS0201
## 14 KLS0205
temp1<-PollenIDs %>%
filter(.data$BeeID %in% (PollenIDs %>%
filter(.data$Genus %in% Genera_rbcL | .data$Genus %in% Genera_ITS2) %>%
select(BeeID) %>%
filter(.data$BeeID %in% TwoBarcodeBees$BeeID)
)$BeeID) %>%
filter(.data$Family %in% (rbind(ref_its2_split_names, ref_rbcL_split_names) %>% filter(.data$Genus %in% Genera_rbcL))$Family |
.data$Family %in% (rbind(ref_its2_split_names, ref_rbcL_split_names) %>% filter(.data$Genus %in% Genera_ITS2))$Family) %>%
group_by(BeeID, Barcode, Family) %>%
summarise(BeeID=first(BeeID), Barcode=first(Barcode), Family=first(Family), Genus=first(Genus), SampleTotalReads=first(SampleTotalReads), Reads=sum(Reads), PropReads=sum(PropReads))
## `summarise()` has grouped output by 'BeeID', 'Barcode'. You can override using
## the `.groups` argument.
# in samples where a common-but-barcode-specific genus occurs, show me the total reads assigned to any genera of the same family
temp1 %>%
ggplot(aes(y=PropReads, col=Barcode))+
geom_boxplot()+
facet_wrap(~Family)
temp2<-PollenIDs %>%
filter(.data$BeeID %in% (PollenIDs %>%
filter(.data$Genus %in% Genera_rbcL | .data$Genus %in% Genera_ITS2) %>%
select(BeeID) %>%
filter(.data$BeeID %in% TwoBarcodeBees$BeeID)
)$BeeID) %>%
filter(.data$Family %in% (rbind(ref_its2_split_names, ref_rbcL_split_names) %>% filter(.data$Genus %in% Genera_rbcL))$Family |
.data$Family %in% (rbind(ref_its2_split_names, ref_rbcL_split_names) %>% filter(.data$Genus %in% Genera_ITS2))$Family) %>%
group_by(BeeID, Barcode, Family) %>%
summarise(BeeID=first(BeeID), Barcode=first(Barcode), Family=first(Family), Genus=first(Genus), SampleTotalReads=first(SampleTotalReads), Reads=sum(Reads), PropReads=sum(PropReads)) %>%
group_by(BeeID, Barcode, Family) %>%
summarise(sumPropReads=sum(PropReads)) %>%
group_by(BeeID, Family) %>%
reframe(BarcodeDiff=sumPropReads[Barcode=="ITS2"]-sumPropReads[Barcode=="rbcL"])
## `summarise()` has grouped output by 'BeeID', 'Barcode'. You can override using
## the `.groups` argument.
## `summarise()` has grouped output by 'BeeID', 'Barcode'. You can override using
## the `.groups` argument.
temp2 %>%
ggplot(aes(y=BarcodeDiff))+
geom_boxplot()+
facet_wrap(~Family)
dat<-PollenIDs %>%
group_by(BeeID,Barcode)%>%
summarise(GeneraRichness=n_distinct(Genus))
## `summarise()` has grouped output by 'BeeID'. You can override using the
## `.groups` argument.
summary(stats::aov(GeneraRichness ~ Barcode, data = dat)) # sig difference in richness detected with rbcL versus ITS2
## Df Sum Sq Mean Sq F value Pr(>F)
## Barcode 1 1297 1297.4 113.9 <2e-16 ***
## Residuals 169 1926 11.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot_df<-dat %>% group_by(Barcode) %>%
summarize(mean = round(mean(GeneraRichness), 2),
err = stats::sd(GeneraRichness)/sqrt(length(GeneraRichness)),
n = n()) %>%
dplyr::mutate(label = "n") %>%
unite("n_label", label, n, sep = " = ", remove = FALSE)
ggplot(plot_df, aes(x = Barcode, y = mean, fill = Barcode)) +
geom_col(color = "black") +
geom_errorbar(aes(ymin = mean - err, ymax = mean + err), width = 0.4) +
geom_label(aes(x = Barcode, y = 0.5, label = n_label), fill="white") +
theme(legend.position = "none") +
labs(x = "Barcode",
y = "Mean Genera Richness",
title = "Genera Richness")+
theme_bw(base_size = 18)+
theme(axis.title.x=element_text("none"),
legend.position = "none")
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): no font could
## be found for family "none"
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): no font could
## be found for family "none"
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): no font could
## be found for family "none"
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): no font could
## be found for family "none"
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): no font could
## be found for family "none"
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): no font could
## be found for family "none"
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): no font could
## be found for family "none"
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): no font could
## be found for family "none"
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): no font could
## be found for family "none"
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): no font could
## be found for family "none"
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): no font could
## be found for family "none"
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): no font could
## be found for family "none"
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): no font could
## be found for family "none"
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): no font could
## be found for family "none"
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : no
## font could be found for family "none"
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : no
## font could be found for family "none"
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, : no
## font could be found for family "none"
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, : no
## font could be found for family "none"
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, : no
## font could be found for family "none"
#this dataset is dat.Queens plus dat.Sites AND it also keeps data from the two barcodes separate
dat<- left_join(
PollenIDgenera.sum %>% pivot_wider(., names_from = "Genus", values_from = "PropReads"),
dat.Queen %>% select(Date:NestSeek), by="BeeID") %>%
left_join(., dat.Sites, by=c("Date", "SiteID")) %>%
mutate(across(Prunus:Phleum, ~replace(., is.na(.), 0))) %>%
select(Barcode, BeeID, Date, Bombus_spp, SiteID, Updated_x, Updated_y, HostGenus, CapTimeH, CapTimeM, NLCD1000_Water:NLCD1000_Crop, Prunus:Phleum)
dat<-as.data.frame(dat)
colnames(dat)
## [1] "Barcode" "BeeID" "Date"
## [4] "Bombus_spp" "SiteID" "Updated_x"
## [7] "Updated_y" "HostGenus" "CapTimeH"
## [10] "CapTimeM" "NLCD1000_Water" "NLCD1000_Developed"
## [13] "NLCD1000_Forest" "NLCD1000_Herbaceous" "NLCD1000_Crop"
## [16] "Prunus" "Dicentra" "Mertensia"
## [19] "Quercus" "Lamium" "Polemonium"
## [22] "Viola" "NA" "Cardamine"
## [25] "Potentilla" "Taraxacum" "Buglossoides"
## [28] "Medicago" "Cercis" "Acer"
## [31] "Exochorda" "Mummenhoffia" "Halesia"
## [34] "Packera" "Robinia" "Platanus"
## [37] "Geum" "Podophyllum" "Thlaspi"
## [40] "Ranunculus" "Viburnum" "Syringa"
## [43] "Crataegus" "Erigeron" "Chelidonium"
## [46] "Leucojum" "Photinia" "Paulownia"
## [49] "Fontanesia" "Hesperis" "Aesculus"
## [52] "Rosa" "Celastrus" "Lamprocapnos"
## [55] "Pulmonaria" "Aronia" "Stylophorum"
## [58] "Pyrus" "Malus" "Fagus"
## [61] "Vicia" "Berberis" "Vaccinium"
## [64] "Impatiens" "Gaylussacia" "Anemone"
## [67] "Spiraea" "Lonicera" "Pinus"
## [70] "Salix" "Anthoxanthum" "Brassica"
## [73] "Elaeagnus" "Glechoma" "Liquidambar"
## [76] "Lupinus" "Morus" "Chamaecyparis"
## [79] "Veronica" "Asimina" "Cunninghamia"
## [82] "Lolium" "Poa" "Vinca"
## [85] "Cynoglossum" "Euscaphis" "Akebia"
## [88] "Magnolia" "Ginkgo" "Picea"
## [91] "Trifolium" "Barbarea" "Dactylis"
## [94] "Fraxinus" "Wisteria" "Virgilia"
## [97] "Liriodendron" "Nyssa" "Gleditsia"
## [100] "Vitis" "Securigera" "Gelsemium"
## [103] "Pachysandra" "Scilla" "Toxicodendron"
## [106] "Ligustrum" "Cornus" "Alliaria"
## [109] "Rubus" "Isatis" "Phleum"
dat2<-as.matrix(dat %>% select(-c(Barcode:NLCD1000_Crop, 'NA')) %>% select(-any_of(singletonGenera)))
rownames(dat2)<-dat$BeeID
sort(colnames(dat2))
## [1] "Acer" "Aesculus" "Anthoxanthum" "Aronia"
## [5] "Asimina" "Barbarea" "Berberis" "Brassica"
## [9] "Buglossoides" "Cardamine" "Celastrus" "Cercis"
## [13] "Chamaecyparis" "Cornus" "Crataegus" "Cunninghamia"
## [17] "Dicentra" "Elaeagnus" "Erigeron" "Exochorda"
## [21] "Fagus" "Fontanesia" "Fraxinus" "Gaylussacia"
## [25] "Geum" "Ginkgo" "Glechoma" "Gleditsia"
## [29] "Halesia" "Hesperis" "Lamium" "Liquidambar"
## [33] "Liriodendron" "Lolium" "Lonicera" "Lupinus"
## [37] "Magnolia" "Mertensia" "Morus" "Mummenhoffia"
## [41] "Packera" "Paulownia" "Picea" "Pinus"
## [45] "Platanus" "Poa" "Podophyllum" "Polemonium"
## [49] "Potentilla" "Prunus" "Pyrus" "Quercus"
## [53] "Ranunculus" "Robinia" "Rosa" "Salix"
## [57] "Stylophorum" "Syringa" "Taraxacum" "Thlaspi"
## [61] "Trifolium" "Vaccinium" "Veronica" "Viburnum"
## [65] "Vicia" "Vinca" "Viola" "Wisteria"
dat$ShannonD<-vegan::diversity(dat2, index="shannon")
summary(stats::aov(ShannonD ~ Barcode, data = dat)) # sig difference in diversity detected with rbcL versus ITS2
## Df Sum Sq Mean Sq F value Pr(>F)
## Barcode 1 2.24 2.2414 8.207 0.0047 **
## Residuals 169 46.15 0.2731
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot_df <- dat %>%
group_by(Barcode) %>%
# calculate mean and standard error of diversity
summarize(mean = round(mean(ShannonD), 2),
err = stats::sd(ShannonD)/sqrt(length(ShannonD)),
n = n()) %>%
dplyr::mutate(label = "n") %>%
unite("n_label", label, n, sep = " = ", remove = FALSE)
ggplot(plot_df, aes(x = Barcode, y = mean, fill = Barcode)) +
geom_col(color = "black") +
# scale_fill_manual(values = colorBlindness::SteppedSequential5Steps) +
geom_errorbar(aes(ymin = mean - err, ymax = mean + err), width = 0.5) +
geom_label(aes(x = Barcode, y = 0.05, label = n_label), fill="white") +
# scale_y_continuous(limits = c(0, 2.75), expand = c(0,0)) +
labs(x = "Barcode",
y = "Mean Shannon diversity",
title = "Shannon diversity")+
theme_bw(base_size = 18)+
theme(legend.position = "none")
rm(plot_df)
# visualizing how barcodes separate in multidimensional space
#dplyr::filters/selects that are useful
# filter(!str_starts(SiteID, "BEF")) %>% #remove Blandy
# select(Cercis:Phleum) %>% #select metabarcoding data
# filter(!is.na(Cercis)) %>% #remove rows without metabarcoding data
# select(-any_of(singletonGenera)) # BUT de-select the singletons
# mutations that are helpful
# mutate(across(everything(), ~as.numeric(.))))
#this dataset is dat.Queens plus dat.Sites AND it also keeps data from the two barcodes separate
dat<- left_join(
PollenIDgenera.sum %>% pivot_wider(., names_from = "Genus", values_from = "PropReads"),
dat.Queen %>% select(Date:NestSeek), by="BeeID") %>%
left_join(., dat.Sites, by=c("Date", "SiteID")) %>%
mutate(across(Prunus:Phleum, ~replace(., is.na(.), 0))) %>%
select(Barcode, BeeID, Date, Bombus_spp, SiteID, Name, Updated_x, Updated_y, HostGenus, CapTimeH, CapTimeM, NLCD1000_Water:NLCD1000_Crop, Prunus:Phleum)
dat2<-as.data.frame(dat %>%
ungroup() %>%
select(Prunus:Phleum) %>%
select(-c(any_of(singletonGenera), "NA")))
colnames(dat2)
## [1] "Prunus" "Dicentra" "Mertensia" "Quercus"
## [5] "Lamium" "Polemonium" "Viola" "Cardamine"
## [9] "Potentilla" "Taraxacum" "Buglossoides" "Cercis"
## [13] "Acer" "Exochorda" "Mummenhoffia" "Halesia"
## [17] "Packera" "Robinia" "Platanus" "Geum"
## [21] "Podophyllum" "Thlaspi" "Ranunculus" "Viburnum"
## [25] "Syringa" "Crataegus" "Erigeron" "Paulownia"
## [29] "Fontanesia" "Hesperis" "Aesculus" "Rosa"
## [33] "Celastrus" "Aronia" "Stylophorum" "Pyrus"
## [37] "Fagus" "Vicia" "Berberis" "Vaccinium"
## [41] "Gaylussacia" "Lonicera" "Pinus" "Salix"
## [45] "Anthoxanthum" "Brassica" "Elaeagnus" "Glechoma"
## [49] "Liquidambar" "Lupinus" "Morus" "Chamaecyparis"
## [53] "Veronica" "Asimina" "Cunninghamia" "Lolium"
## [57] "Poa" "Vinca" "Magnolia" "Ginkgo"
## [61] "Picea" "Trifolium" "Barbarea" "Fraxinus"
## [65] "Wisteria" "Liriodendron" "Gleditsia" "Cornus"
dat2mat<-as.matrix(dat2)
library(vegan)
## Loading required package: permute
## Loading required package: lattice
NMDS_barcode = metaMDS(dat2mat, k = 2, trymax = 20) #all data
## Run 0 stress 0.1764864
## Run 1 stress 0.1815978
## Run 2 stress 0.1762644
## ... New best solution
## ... Procrustes: rmse 0.0347055 max resid 0.3131506
## Run 3 stress 0.1780208
## Run 4 stress 0.1795438
## Run 5 stress 0.1774141
## Run 6 stress 0.1769081
## Run 7 stress 0.1794215
## Run 8 stress 0.1802235
## Run 9 stress 0.1797725
## Run 10 stress 0.177031
## Run 11 stress 0.1783333
## Run 12 stress 0.1772149
## Run 13 stress 0.1771803
## Run 14 stress 0.1766901
## ... Procrustes: rmse 0.04831964 max resid 0.4201105
## Run 15 stress 0.17843
## Run 16 stress 0.1804889
## Run 17 stress 0.1800755
## Run 18 stress 0.1759254
## ... New best solution
## ... Procrustes: rmse 0.03248528 max resid 0.2699901
## Run 19 stress 0.176513
## Run 20 stress 0.1788741
## *** Best solution was not repeated -- monoMDS stopping criteria:
## 12: no. of iterations >= maxit
## 8: stress ratio > sratmax
NMDS_barcode
##
## Call:
## metaMDS(comm = dat2mat, k = 2, trymax = 20)
##
## global Multidimensional Scaling using monoMDS
##
## Data: dat2mat
## Distance: bray
##
## Dimensions: 2
## Stress: 0.1759254
## Stress type 1, weak ties
## Best solution was not repeated after 20 tries
## The best solution was from try 18 (random start)
## Scaling: centring, PC rotation, halfchange scaling
## Species: expanded scores based on 'dat2mat'
stressplot(NMDS_barcode)
#goodness(NMDS_barcode)
#ordiplot(NMDS_barcode, type="text", cex=1.2)
NMDS_plot_df<- cbind(dat %>% ungroup(),
vegan::scores(NMDS_barcode, display = "sites") %>%
as.data.frame())
ggplot(NMDS_plot_df, aes(x=NMDS1, y=NMDS2, color=Barcode, shape = Barcode))+
geom_point(size = 4, alpha=0.5) +
# scale_color_manual(values = colorBlindness::SteppedSequential5Steps)+
stat_ellipse(linetype = 2, linewidth = 1) +
scale_x_continuous(limits=c(-4.5,1.5), breaks = c(-4.5,-3.5,-1,-0.5,0,0.5,1,1.5,2))+
ggbreak::scale_x_break(breaks=c(-3.5,-1), scale=4)+
# labs(title = "All Data")+
theme_bw(base_size = 18)
#Try removing samples appearing as outliers in the intital NMDS
# which(NMDS_plot_df$NMDS1>1)
# which(NMDS_plot_df$NMDS1<(-3))
# View(dat[c(21,30,34,46),])
#
# NMDS_barcode2 = metaMDS(dat2mat[-c(21,30,34,46),], k = 2, trymax = 20) #removing outliers from previous
# stressplot(NMDS_barcode2)
# NMDS_barcode2
#
# NMDS_plot_df2<- cbind(dat[-c(21,30,34,46),] %>% ungroup(),
# vegan::scores(NMDS_barcode2, display = "sites") %>%
# as.data.frame())
#
# ggplot(NMDS_plot_df2, aes(x=NMDS1, y=NMDS2, color=Barcode, shape=Barcode))+
# geom_point(size = 4, alpha=0.5) +
# # scale_color_manual(values = colorBlindness::SteppedSequential5Steps)+
# stat_ellipse(linetype = 2, linewidth = 1) +
# labs(title = "Excluding Outliers")
# Try removing samples without data from both barcodes
# dat<- left_join(
# PollenIDgenera.sum %>% pivot_wider(., names_from = "Genus", values_from = "PropReads"),
# dat.Queen %>% select(Date:NestSeek), by="BeeID") %>%
# left_join(., dat.Sites, by=c("Date", "SiteID")) %>%
# mutate(across(Prunus:Phleum, ~replace(., is.na(.), 0))) %>%
# select(Barcode, BeeID, Date, Bombus_spp, SiteID, Name, Updated_x, Updated_y, HostName, HostGenus, CapTimeH, CapTimeM, NLCD1000_Water:NLCD1000_Crop, Prunus:Phleum) %>% filter(BeeID %in% both) # ADDITIONAL FILTER: restrict NMDS to just samples with data from both barcodes)
# dat2<-as.data.frame(dat %>%
# ungroup() %>%
# select(Prunus:Phleum) %>%
# select(-c(any_of(singletonGenera), "NA")))
# colnames(dat2)
# dat2mat<-as.matrix(dat2)
#
# NMDS_barcode4 = metaMDS(dat2mat, k = 2, trymax = 20) #all data
# NMDS_barcode4
# stressplot(NMDS_barcode4)
# goodness(NMDS_barcode4)
# ordiplot(NMDS_barcode4, type="text", cex=1.2)
#
# NMDS_plot_df<- cbind(dat %>% ungroup(),
# vegan::scores(NMDS_barcode4, display = "sites") %>%
# as.data.frame())
#
# ggplot(NMDS_plot_df, aes(x=NMDS1, y=NMDS2, color=Barcode))+
# geom_point(size = 4, alpha=0.5) +
# # scale_color_manual(values = colorBlindness::SteppedSequential5Steps)+
# stat_ellipse(linetype = 2, linewidth = 1) +
# labs(title = "Only Samples w/ Both Barcodes")
detach('package:vegan')
So the databases contain the same genera detected in the samples. But the average richness is higher in rbcL, as well as shannon diversity. And all three NMDS show rbcL clustering separately than ITS2.
# construct contingency table
dat<-PollenIDs %>% left_join(., dat.Queen %>% select(BeeID, Bombus_spp, Date), by="BeeID") %>%
filter(Bombus_spp=="bimaculatus"|Bombus_spp=="griseocollis"|Bombus_spp=="impatiens") %>%
filter(.data$Genus %in% topGenera2$Genus) %>%
group_by(Bombus_spp, BeeID, Genus) %>% #first summarize across barcode within samples, genus
summarize(PropReads=sum(PropReads)/first(ifelse(.data$BeeID %in% TwoBarcodeBees$BeeID, 2,1))) %>%
# filter(.data$BeeID %in% TwoBarcodeBees$BeeID) %>% #optional filter for BOTH BARCODE BEES
group_by(Bombus_spp, Genus) %>% #next summarize across samples within Bombus spp, genus
summarise(PropReads=sum(PropReads)) %>%
pivot_wider(names_from = "Genus", values_from = "PropReads", values_fill = 0) %>%
as.data.frame(.)
## `summarise()` has grouped output by 'Bombus_spp', 'BeeID'. You can override
## using the `.groups` argument.
## `summarise()` has grouped output by 'Bombus_spp'. You can override using the
## `.groups` argument.
rownames(dat)<-dat$Bombus_spp
dat<-dat %>% select(-Bombus_spp) %>% drop_na()
stats::chisq.test(dat)$expected # Expected counts < 5 ==> chi square approx. maybe doubtful
## Warning in stats::chisq.test(dat): Chi-squared approximation may be incorrect
## Acer Aesculus Barbarea Cercis Elaeagnus Glechoma
## bimaculatus 0.7844243 1.5517698 0.7243491 6.745888 0.9631372 1.7911066
## griseocollis 0.2790733 0.5520704 0.2577004 2.399973 0.3426537 0.6372189
## impatiens 0.1577744 0.3121137 0.1456912 1.356828 0.1937197 0.3602525
## Halesia Lamium Liriodendron Lonicera Mertensia Mummenhoffia
## bimaculatus 2.3516963 5.375135 1.1890315 2.2500301 2.1920186 0.5701779
## griseocollis 0.8366589 1.912302 0.4230197 0.8004893 0.7798507 0.2028512
## impatiens 0.4730061 1.081122 0.2391547 0.4525576 0.4408895 0.1146822
## Pinus Prunus Quercus Salix Taraxacum Vaccinium
## bimaculatus 3.1528013 4.7091124 0.8307746 4.7794906 0.7422049 0.8301126
## griseocollis 1.1216666 1.6753527 0.2955632 1.7003911 0.2640530 0.2953277
## impatiens 0.6341356 0.9471627 0.1670970 0.9613182 0.1492827 0.1669639
## Viburnum Viola Paulownia
## bimaculatus 0.7895577 1.7715797 0.44578624
## griseocollis 0.2808996 0.6302718 0.15859660
## impatiens 0.1588069 0.3563250 0.08966278
(stats::chisq.test(dat)$expected)<5 # if > 20% of expected cell counts are less than 5, then use Fisher's exact test
## Warning in stats::chisq.test(dat): Chi-squared approximation may be incorrect
## Acer Aesculus Barbarea Cercis Elaeagnus Glechoma Halesia Lamium
## bimaculatus TRUE TRUE TRUE FALSE TRUE TRUE TRUE FALSE
## griseocollis TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## impatiens TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## Liriodendron Lonicera Mertensia Mummenhoffia Pinus Prunus Quercus
## bimaculatus TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## griseocollis TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## impatiens TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## Salix Taraxacum Vaccinium Viburnum Viola Paulownia
## bimaculatus TRUE TRUE TRUE TRUE TRUE TRUE
## griseocollis TRUE TRUE TRUE TRUE TRUE TRUE
## impatiens TRUE TRUE TRUE TRUE TRUE TRUE
#stats::chisq.test(dat)
stats::fisher.test(dat, simulate.p.value=TRUE)
## Warning in stats::fisher.test(dat, simulate.p.value = TRUE): 'x' has been
## rounded to integer: Mean relative difference: 0.1697392
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 2000 replicates)
##
## data: dat
## p-value = 0.006997
## alternative hypothesis: two.sided
# Plot the data! prop of reads assigned to top genera for each bombus spp
dat<-PollenIDs %>% left_join(., dat.Queen %>% select(BeeID, Bombus_spp, Date), by="BeeID") %>%
mutate(FamGen=paste(Family, Genus, sep=" "),
unCommon=ifelse(
Bombus_spp=="bimaculatus"|Bombus_spp=="griseocollis"|Bombus_spp=="impatiens",
"common","uncommon")) %>%
group_by(BeeID, Family, FamGen) %>%
summarise(BeeID=first(BeeID),
Bombus_spp=first(Bombus_spp),
unCommon=first(unCommon),
PropReads=sum(PropReads)/first(ifelse(.data$BeeID %in% TwoBarcodeBees$BeeID, 2,1)),
Family=first(Family),
Genus=first(Genus),
FamGen=first(FamGen)
) %>% ungroup() %>% arrange(FamGen)
## `summarise()` has grouped output by 'BeeID', 'Family'. You can override using
## the `.groups` argument.
dat %>%
ggplot(aes(x=Bombus_spp)) +
geom_col(aes(y=PropReads,
fill=ifelse(.data$Genus %in% topGenera2$Genus, FamGen,
ifelse(!is.na(.data$Genus), "z_AllOther", NA))),
position="fill") +
scale_fill_manual(values = myColors.FamGen, na.value="gray")+ #FamGen.palette
scale_x_discrete(labels = c("pensylvanicus"="pensyl-\nvanicus", "vagans-sandersoni-perplexus" = "vagans-\nsandersoni-\nperplexus")) +
geom_label(data=dat %>%
group_by(Bombus_spp) %>%
summarize(num=n_distinct(BeeID)),
aes(x=Bombus_spp, y=-0.05, label=paste("n = ", num, sep="")))+
guides(fill=guide_legend(title = "Family Genus"))+
theme_bw(base_size = 16)+
labs(title="", y="Proportion of Sample Reads", x="Bombus species")+
theme(legend.text = element_text(size=10),
legend.position = "bottom")
# zoom in: plot the "z_AllOther" pollen families to get an idea of the taxa we're missing from the above plot.
# this plot shows the proportion of "AllOther" reads assigned to taxonomic family
dat<-PollenIDs %>% left_join(., dat.Queen %>% select(BeeID, Bombus_spp, Date), by="BeeID") %>%
group_by(BeeID, Family, Genus) %>%
summarise(BeeID=first(BeeID),
Bombus_spp=first(Bombus_spp),
PropReads=sum(PropReads)/first(ifelse(.data$BeeID %in% TwoBarcodeBees$BeeID, 2,1)),
Family=first(Family),
Genus=first(Genus),
) %>% ungroup() %>%
filter(!.data$Genus %in% topGenera2$Genus,!is.na(.data$Family))
## `summarise()` has grouped output by 'BeeID', 'Family'. You can override using
## the `.groups` argument.
plot2<-dat %>%
ggplot(aes(x=Bombus_spp)) +
geom_col(aes(x=Bombus_spp, y=PropReads,
fill=forcats::fct_lump_n(Family, 19)),
position="fill") +
scale_x_discrete(labels = c("pensylvanicus"="pensyl-\nvanicus", "vagans-sandersoni-perplexus" = "vagans-\nsandersoni-\nperplexus")) +
scale_fill_manual(values=c("#990F0F", "#B22C2C" , "#CC5151" , "#E57E7E" , "#FFB2B2" , "#99540F" , "#B26F2C" , "#CC8E51" , "#E5B17E" , "#FFD8B2" , "#6B990F" , "#85B22C" , "#A3CC51" , "#C3E57E" , "#E5FFB2" , "#0F6B99" , "#2C85B2" , "#51A3CC" , "#7EC3E5" , "#B2E5FF" , "#8F7EE5", "black")) +
# scale_fill_brewer(palette = "Paired", na.value = "black")+ #FamGen.palette
guides(fill=guide_legend(title = "Family"))+
theme_bw(base_size = 16)+
labs(title="Subset 'AllOther' Taxa", ylab="Proportion of 'AllOther' Reads", x="Bombus species")+
theme(axis.text.x = element_text(size=12))+
theme(legend.position = "bottom")
#
# ggpubr::ggarrange(plot1, plot2, ncol = 1, nrow = 2)
dat<-PollenIDs %>%
left_join(., dat.Queen %>% select(BeeID, SiteID, Bombus_spp),by="BeeID") %>%
filter(!Genus %in% singletonGenera) %>% #filter to REMOVE the singleton genera
# filter(.data$BeeID %in% TwoBarcodeBees$BeeID) %>% #optional filter for BOTH BARCODE BEES
group_by(BeeID,Bombus_spp)%>%
summarise(GeneraRichness=n_distinct(Genus)) %>%
filter(Bombus_spp=="bimaculatus"|Bombus_spp=="griseocollis"|Bombus_spp=="impatiens") #use this when running aov() on just the common bumble bee species
## `summarise()` has grouped output by 'BeeID'. You can override using the
## `.groups` argument.
summary(stats::aov(GeneraRichness ~ Bombus_spp, data = dat)) # no sig difference in richness between 4 most common species, *****unless you exclude bees without data from BOTH barcodes
## Df Sum Sq Mean Sq F value Pr(>F)
## Bombus_spp 2 55.9 27.94 1.117 0.332
## Residuals 90 2252.0 25.02
plot_df <- dat %>%
filter(Bombus_spp=="bimaculatus"|Bombus_spp=="griseocollis"| Bombus_spp=="impatiens") %>%
# filter(.data$BeeID %in% TwoBarcodeBees$BeeID) %>% #optional filter for BOTH BARCODE BEES
group_by(Bombus_spp) %>%
# calculate mean and standard error of diversity
summarize(mean = round(mean(GeneraRichness), 2),
err = stats::sd(GeneraRichness)/sqrt(length(GeneraRichness)),
n = n()) %>%
dplyr::mutate(label = "n") %>%
unite("n_label", label, n, sep = " = ", remove = FALSE)
ggplot(plot_df, aes(x = Bombus_spp, y = mean)) +
geom_col(color = "black", aes(fill = Bombus_spp)) +
# scale_fill_manual(values = colorBlindness::Blue2DarkRed18Steps) +
geom_errorbar(aes(ymin = mean - err, ymax = mean + err), width = 0.5) +
geom_label(aes(x = Bombus_spp, y = 1, label = n_label)) +
labs(x = "Bombus species",
y = "Mean Genera Richness",
title = "All Data (incl. rbcL or ITS2 only samples)")+
theme_bw(base_size = 18)+
theme(legend.position = "none")
dat<- left_join(PollenIDgenera.sum %>% pivot_wider(., names_from = "Genus", values_from = "PropReads"),
dat.Queen %>% select(Date:NestSeek), by="BeeID") %>%
arrange(BeeID) %>%
filter(.data$BeeID %in% TwoBarcodeBees$BeeID) %>% #optional filter for BOTH BARCODE BEES
mutate(across(Prunus:Phleum, ~replace(., is.na(.), 0))) %>%
group_by(BeeID) %>% summarise(across(Prunus:Phleum, ~mean(.)/first(ifelse(.data$BeeID %in% TwoBarcodeBees$BeeID, 2,1)))) %>%
select(Prunus:Phleum) %>% # select just read data
select(-any_of(singletonGenera)) # de-select the singleton genera
dat<-as.matrix(dat)
dat2<-left_join(PollenIDgenera.sum %>% pivot_wider(., names_from = "Genus", values_from = "PropReads"),
dat.Queen %>% select(Date:NestSeek), by="BeeID") %>%
left_join(., dat.Sites %>% select(SiteID, Name), by="SiteID")%>%
arrange(BeeID) %>%
filter(.data$BeeID %in% TwoBarcodeBees$BeeID) %>% #optional filter for BOTH BARCODE BEES
group_by(BeeID) %>% summarise(Bombus_spp=first(Bombus_spp), across(Prunus:Phleum, ~mean(.)/first(ifelse(.data$BeeID %in% TwoBarcodeBees$BeeID, 2,1)))) %>%
select(BeeID, Bombus_spp) %>%
ungroup()
rownames(dat)<-dat2$BeeID
# for the old version of dat and dat2
# rownames(dat)<-paste(dat2$BeeID, dat2$Barcode, sep="_")
# dat2$BeeIDBarcode<-paste(dat2$BeeID, dat2$Barcode, sep="_")
dat2$ShannonD<-vegan::diversity(dat, index="shannon")
dat2<- dat2 %>% filter(Bombus_spp=="bimaculatus"|Bombus_spp=="griseocollis"|Bombus_spp=="impatiens") #use this when running aov() on just the common bumble bee species
summary(stats::aov(ShannonD ~ Bombus_spp, data = dat2)) # WOAH THERE IS sig difference in shannon diversity of pollens collected by three common bombus spp when you only include bees with both barcodes
## Df Sum Sq Mean Sq F value Pr(>F)
## Bombus_spp 2 2.633 1.3166 4.268 0.019 *
## Residuals 54 16.659 0.3085
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot_df <- dat2 %>%
filter(Bombus_spp=="bimaculatus" | Bombus_spp=="griseocollis"| Bombus_spp=="impatiens") %>%
group_by(Bombus_spp) %>%
# calculate mean and standard error of diversity
summarize(mean = round(mean(ShannonD), 2),
err = stats::sd(ShannonD)/sqrt(length(ShannonD)),
n = n_distinct(BeeID)) %>%
dplyr::mutate(label = "n") %>%
unite("n_label", label, n, sep = " = ", remove = FALSE)
ggplot(plot_df, aes(x = Bombus_spp, y = mean)) +
geom_col(color = "black", aes(fill = Bombus_spp)) +
# scale_fill_manual(values = colorBlindness::Blue2DarkRed18Steps) +
geom_errorbar(aes(ymin = mean - err, ymax = mean + err), width = 0.5) +
geom_label(aes(x = Bombus_spp, y = 0.1, label = n_label)) +
labs(x = "Bombus_spp",
y = "Mean Shannon diversity",
title = "All Data (incl. rbcL or ITS2 only samples)") +
theme_bw(base_size = 18)+
theme(legend.position = "none")
#this dataset is like dat.Queens plus dat.Sites AND it also keeps data from the two barcodes separate
dat <- left_join(PollenIDgenera.sum %>% pivot_wider(., names_from = "Genus", values_from = "PropReads"),
dat.Queen %>% select(Date:NestSeek), by="BeeID") %>%
left_join(., dat.Sites, by=c("Date", "SiteID")) %>%
mutate(across(Prunus:Phleum, ~replace(., is.na(.), 0))) %>%
mutate(CapTimeHM=hms::hms(hours=CapTimeH, minutes=CapTimeM, seconds=0), JDate=julian(Date)) %>% #create variables: CapTimeHM and julian date JDate
mutate(HostGenus=ifelse(is.na(.data$HostGenus),"none",HostGenus)) %>%
select(Barcode, BeeID, JDate, Bombus_spp, SiteID, Name, Updated_x, Updated_y, HostGenus, CapTimeHM, Prunus:Phleum) %>%
rowwise() %>% mutate(max=max(c_across(Prunus:Phleum))) %>%
filter(Bombus_spp=="bimaculatus" | Bombus_spp=="griseocollis"| Bombus_spp=="impatiens") %>%
# filter(Name=="Blandy Experimental Farm") %>%
# filter(BeeID %in% TwoBarcodeBees$BeeID) %>%
ungroup()
summary(dat %>% filter(HostGenus=="Elaeagnus") %>% select(Elaeagnus))
## Elaeagnus
## Min. :0.00000
## 1st Qu.:0.00000
## Median :0.00000
## Mean :0.05672
## 3rd Qu.:0.03598
## Max. :0.55718
summary(dat %>% filter(HostGenus=="Glechoma") %>% select(Glechoma))
## Glechoma
## Min. :0.00000
## 1st Qu.:0.00000
## Median :0.02637
## Mean :0.09778
## 3rd Qu.:0.13449
## Max. :0.60859
summary(dat %>% filter(HostGenus=="Lamium") %>% select(Lamium))
## Lamium
## Min. :0.00000
## 1st Qu.:0.05297
## Median :0.66332
## Mean :0.51616
## 3rd Qu.:0.87530
## Max. :0.97651
# explanatory variables must be centered, standardized (if explanatory variables are in different units), transformed (to limit the skew of explanatory variables) or normalized (to linearize relationships) following the same principles as in PCA
# sample size by Bombus species and floral host
dat %>% group_by(Bombus_spp) %>% summarise(n=n_distinct(BeeID))
## # A tibble: 3 × 2
## Bombus_spp n
## <fct> <int>
## 1 bimaculatus 56
## 2 griseocollis 26
## 3 impatiens 11
dat %>% group_by(HostGenus) %>% summarise(n=n_distinct(BeeID))
## # A tibble: 17 × 2
## HostGenus n
## <chr> <int>
## 1 "" 4
## 2 "Barbarea" 5
## 3 "Cercis" 2
## 4 "Chaenomeles" 1
## 5 "Elaeagnus" 26
## 6 "Glechoma" 18
## 7 "Hellebores" 1
## 8 "Lamium" 12
## 9 "Lonicera" 1
## 10 "Malus" 3
## 11 "Mertensia" 4
## 12 "Photina" 1
## 13 "Polemonium" 1
## 14 "Prunus" 8
## 15 "Pyracantha" 4
## 16 "Salix" 1
## 17 "Spiraea" 1
# these counts incl double counts of samples with data from more than one barcode
hist(scale(as.numeric(dat$JDate)))
hist(scale(as.numeric(dat$CapTimeHM)))
dat$JDate<-scale(as.numeric(dat$JDate))
dat$CapTimeHM<-scale(as.numeric(dat$CapTimeHM))
dat$Updated_y<-scale(as.numeric(dat$Updated_y))
dat2<-as.data.frame(dat %>%
ungroup() %>%
select(Prunus:Phleum) %>%
select(-c(any_of(singletonGenera), "NA")) #remove genera detected only once
)
dat2<-as.matrix(dat2)
rownames(dat2)<-dat$BeeID
dat2<-vegan::decostand(dat2, method = "hellinger") # community data standardization
library(vegan)
# global model1 should include:
#dat2 ~ Bombus_spp + Barcode + JDate + CapTimeHM + HostGenus + Updated_y
#dat2 ~ Bombus_spp + Barcode + JDate + CapTimeHM + Updated_y (no host model)
global.RDA.step<-ordiR2step(rda(dat2~1, data=dat), scope = rda(dat2 ~ Bombus_spp + Barcode + JDate + CapTimeHM + HostGenus + Updated_y, data=dat))
## Step: R2.adj= 0
## Call: dat2 ~ 1
##
## R2.adjusted
## <All variables> 0.31347007
## + HostGenus 0.20083371
## + JDate 0.05254167
## + Barcode 0.04603774
## + Bombus_spp 0.03631868
## + Updated_y 0.02688220
## + CapTimeHM 0.00929700
## <none> 0.00000000
##
## Df AIC F Pr(>F)
## + HostGenus 16 -37.986 3.3403 0.002 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: R2.adj= 0.2008337
## Call: dat2 ~ HostGenus
##
## R2.adjusted
## <All variables> 0.3134701
## + Barcode 0.2499960
## + JDate 0.2336344
## + Updated_y 0.2196355
## + Bombus_spp 0.2160232
## + CapTimeHM 0.2050629
## <none> 0.2008337
##
## Df AIC F Pr(>F)
## + Barcode 1 -46.642 9.7181 0.002 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: R2.adj= 0.249996
## Call: dat2 ~ HostGenus + Barcode
##
## R2.adjusted
## <All variables> 0.3134701
## + JDate 0.2803462
## + Updated_y 0.2693801
## + Bombus_spp 0.2661645
## + CapTimeHM 0.2546322
## <none> 0.2499960
##
## Df AIC F Pr(>F)
## + JDate 1 -51.979 6.5669 0.002 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: R2.adj= 0.2803462
## Call: dat2 ~ HostGenus + Barcode + JDate
##
## R2.adjusted
## <All variables> 0.3134701
## + Updated_y 0.2968662
## + Bombus_spp 0.2946883
## + CapTimeHM 0.2839004
## <none> 0.2803462
##
## Df AIC F Pr(>F)
## + Updated_y 1 -54.612 4.0778 0.002 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: R2.adj= 0.2968662
## Call: dat2 ~ HostGenus + Barcode + JDate + Updated_y
##
## R2.adjusted
## <All variables> 0.3134701
## + Bombus_spp 0.3109382
## + CapTimeHM 0.3001803
## <none> 0.2968662
##
## Df AIC F Pr(>F)
## + Bombus_spp 2 -55.97 2.3274 0.002 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: R2.adj= 0.3109382
## Call: dat2 ~ HostGenus + Barcode + JDate + Updated_y + Bombus_spp
##
## R2.adjusted
## + CapTimeHM 0.3134701
## <All variables> 0.3134701
## <none> 0.3109382
# sample size of model
bees<-length(unique(dat$BeeID))
records<-length(dat$BeeID)
#only bees with both barcodes, best model
#RDA_Bombus<-vegan::rda(dat2 ~ HostGenus + Barcode + Bombus_spp + Updated_y + JDate, data=dat)
#all data best model
RDA_Bombus<-vegan::rda(dat2 ~ HostGenus + Barcode + JDate + Updated_y + Bombus_spp, data=dat)
#no host best model
#RDA_Bombus<-vegan::rda(dat2 ~ Bombus_spp + Barcode + JDate + Updated_y + CapTimeHM, data=dat)
#for getting stats from best model
RsquareAdj(RDA_Bombus)
## $r.squared
## [1] 0.4080543
##
## $adj.r.squared
## [1] 0.3109382
anova.cca(RDA_Bombus, by="term")
## Permutation test for rda under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## Model: rda(formula = dat2 ~ HostGenus + Barcode + JDate + Updated_y + Bombus_spp, data = dat)
## Df Variance F Pr(>F)
## HostGenus 16 0.25034 3.8740 0.001 ***
## Barcode 1 0.04272 10.5775 0.001 ***
## JDate 1 0.02770 6.8584 0.001 ***
## Updated_y 1 0.01681 4.1611 0.001 ***
## Bombus_spp 2 0.01880 2.3274 0.002 **
## Residual 128 0.51697
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(RDA_Bombus)[c(7:9,11)]
## $tot.chi
## [1] 0.8733337
##
## $constr.chi
## [1] 0.3563675
##
## $unconst.chi
## [1] 0.5169662
##
## $concont
## $concont$importance
## Importance of components:
## RDA1 RDA2 RDA3 RDA4 RDA5 RDA6 RDA7
## Eigenvalue 0.07344 0.06484 0.04732 0.04091 0.03229 0.01874 0.01521
## Proportion Explained 0.20609 0.18196 0.13278 0.11480 0.09061 0.05259 0.04268
## Cumulative Proportion 0.20609 0.38805 0.52083 0.63562 0.72624 0.77883 0.82151
## RDA8 RDA9 RDA10 RDA11 RDA12 RDA13
## Eigenvalue 0.01298 0.01186 0.01048 0.007264 0.005068 0.004534
## Proportion Explained 0.03642 0.03328 0.02942 0.020383 0.014221 0.012723
## Cumulative Proportion 0.85793 0.89122 0.92064 0.941021 0.955242 0.967965
## RDA14 RDA15 RDA16 RDA17 RDA18 RDA19
## Eigenvalue 0.002976 0.002335 0.002217 0.001436 0.0009808 0.0007399
## Proportion Explained 0.008352 0.006552 0.006220 0.004029 0.0027523 0.0020763
## Cumulative Proportion 0.976317 0.982870 0.989090 0.993119 0.9958708 0.9979472
## RDA20 RDA21
## Eigenvalue 0.0004455 0.0002861
## Proportion Explained 0.0012500 0.0008028
## Cumulative Proportion 0.9991972 1.0000000
#vegan::ordiplot(RDA_Bombus, scaling=1, type="text")
# Scaling 1 shows similarities between objects in the response matrix.
# Sites (numbers) that are closer together have more similar communities.
# Species that are closer together occupy more sites in common.
RDA_plot_df<-vegan::ordiplot(RDA_Bombus, scaling=2, type = "text")
# Scaling 2 shows the effects of explanatory variables.
# Longer arrows mean this variable strongly drives the variation in the community matrix.
# Arrows pointing in opposite directions have a negative relationship.
# Arrows pointing in the same direction have a positive relationship.
#an RDA with host, County, Barcode, Date, and Bombus species can explain ~30% of the total variation in pollen community composition.
#grabbing data from rda model (or the ordiplot thereof) for plotting in ggplot
sites.long <- BiodiversityR::sites.long(RDA_plot_df, env.data=dat)
species.long <- BiodiversityR::species.long(RDA_plot_df)
species.long2<- species.long %>%
mutate(diff=sqrt(abs(axis1)^2+abs(axis2)^2)) %>%
filter(diff>0.15) %>%
mutate(labels=paste("Pollen_",labels,sep=""))
axis.long <- BiodiversityR::axis.long(RDA_Bombus, choices=c(1, 2))
arrows <- as.data.frame(vegan::scores(RDA_Bombus)$biplot) #
arrows$names<-rownames(arrows)
arrows <- arrows %>% # select only Host and Barcode arrows greater than 0.2 units from 0,0
mutate(diff=sqrt(abs(RDA1)^2+abs(RDA2)^2)) %>%
filter(
!str_starts(names, "Bombus"), #Bombus will be visualized with color
!str_starts(names,"Barcode") #Barcode will be visualized with shape
# diff>0.15
) %>%
rowwise() %>% mutate(
names=ifelse(str_starts(names, "Host"),
paste("Host_", strsplit(names, split="HostGenus")[[1]][2], sep=""),
ifelse(str_starts(names, "Barcode"),
strsplit(names, split="Barcode")[[1]][2],
names)) # making names pretty
) %>%
mutate( # create variable to flag and filter hosts with only 1 obs, keep everything else
keep=ifelse(names %in% ((dat %>% group_by(HostGenus) %>% summarise(n=n_distinct(BeeID)) %>% filter(n>1) %>% mutate(HostGenus=paste("Host_",HostGenus,sep="")))$HostGenus), 1,
ifelse(!str_starts(names,"Host"), 1, 0))) %>% filter(keep==1) %>% select(-keep) # remove filter variable
colnames(arrows)<-c("axis1","axis2","labels","diff")
rbind(species.long2, arrows)
## axis1 axis2 labels diff
## Prunus -0.11288000 0.52102630 Pollen_Prunus 0.5331138
## Mertensia -0.17866428 0.09942979 Pollen_Mertensia 0.2044681
## Lamium 0.35279099 0.39807614 Pollen_Lamium 0.5319080
## Cercis -0.79234535 -0.16051723 Pollen_Cercis 0.8084411
## Acer -0.01027997 0.16465866 Pollen_Acer 0.1649793
## Platanus 0.08641551 -0.16057704 Pollen_Platanus 0.1823530
## Lonicera 0.10495084 -0.25183148 Pollen_Lonicera 0.2728255
## Pinus 0.24795259 -0.24706315 Pollen_Pinus 0.3500296
## Salix 0.22120681 -0.28329771 Pollen_Salix 0.3594302
## Elaeagnus 0.05751494 -0.17683115 Pollen_Elaeagnus 0.1859495
## 1 0.19182521 -0.02551423 Host_Barbarea 0.1935146
## 2 -0.42554002 -0.11812127 Host_Cercis 0.4416299
## 3 -0.05601890 -0.40583902 Host_Elaeagnus 0.4096870
## 4 -0.01229819 0.10829750 Host_Glechoma 0.1089935
## 5 0.41803372 0.33924760 Host_Lamium 0.5383689
## 6 0.11946913 -0.16560654 Host_Malus 0.2042019
## 7 -0.53335114 -0.08278345 Host_Mertensia 0.5397375
## 8 -0.13443660 0.44378379 Host_Prunus 0.4636995
## 9 0.07244374 -0.05486005 Host_Pyracantha 0.0908720
## 10 0.21647657 -0.71850933 JDate 0.7504117
## 11 -0.31734438 -0.36164683 Updated_y 0.4811402
plot1<- ggplot() +
geom_vline(xintercept = c(0), color = "grey70", linetype = 2) +
geom_hline(yintercept = c(0), color = "grey70", linetype = 2) +
labs(x=axis.long[1, "label"], y=axis.long[2, "label"]) +
scale_x_continuous(sec.axis = dup_axis(labels=NULL, name=NULL)) +
scale_y_continuous(sec.axis = dup_axis(labels=NULL, name=NULL)) +
geom_point(data=sites.long,
aes(x=axis1, y=axis2, color=Bombus_spp, shape=Barcode),
size=5, alpha=0.7) +
# scale_shape_manual(values=c(16, 17)) +
# stat_ellipse(data=sites.long, aes(x=axis1, y=axis2, colour=Bombus_spp), linewidth = 1) +
geom_point(data=species.long, aes(x=axis1, y=axis2), size=3, alpha=0.7) +
ggrepel::geom_text_repel(data=rbind(species.long2, arrows), aes(x = axis1, y = axis2, label = labels), direction = "both", size=5, box.padding = 0.75) +
geom_segment(data=arrows, aes(x = 0, xend = axis1, y = 0, yend = axis2), arrow = arrow(length = unit(0.25, "cm")), colour="black") +
coord_fixed(ratio=1) +
theme_bw(base_size = 18)+
labs(title=paste("Bombus species differences (n = ",records," records from ",bees," bees)", sep=""))+
theme(axis.title = element_text(size=18),
axis.text=element_text(size=14))
plot1
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
detach('package:vegan')
#whether or not you exclude samples without both barcodes, the model tells the same story
We also and fit a mixed effects linear model with four land use variables (i.e., Forest, Herb, Urban, Crop), survey date and sample collection time, using the lme function of the nlme package.
#this dataset is dat.Queens plus dat.Sites AND it also keeps data from the two barcodes separate
# environmental variables dataset
dat<- left_join(PollenIDgenera.sum %>% pivot_wider(., names_from = "Genus", values_from = "PropReads"),
dat.Queen %>% select(Date:NestSeek), by="BeeID") %>%
left_join(., dat.Sites, by=c("Date", "SiteID")) %>%
mutate(across(Prunus:Phleum, ~replace(., is.na(.), 0))) %>%
mutate(CapTimeHM=hms::hms(hours=CapTimeH, minutes=CapTimeM, seconds=0), JDate=julian(Date)) %>% #create variables
select(Barcode, BeeID, SiteID, Name,
JDate, CapTimeHM,
Updated_y, NLCD1000_Water:NLCD1000_Crop,
Prunus:Phleum,
-"NA") %>%
# filter(BeeID %in% TwoBarcodeBees$BeeID) %>%
filter(Name!="Blandy Experimental Farm") %>% #remove Blandy bees
ungroup()
dat
## # A tibble: 76 × 107
## Barcode BeeID SiteID Name JDate CapTimeHM Updated_y NLCD1000_Water
## <chr> <fct> <fct> <chr> <drt> <time> <dbl> <dbl>
## 1 ITS2 CKC0001 GMCP1 Gertrude E. … 1908… 14:25 38.9 0
## 2 ITS2 ESE0004 DTP2 Darden Towe … 1909… 13:04 38.0 0.0194
## 3 ITS2 KLS0007 ICF1 Ivy Creek Fo… 1907… 14:51 38.1 0.0988
## 4 ITS2 KLS0044 VHVG1 Vint Hill Vi… 1908… 12:04 38.7 0.000578
## 5 ITS2 KLS0045 VHVG1 Vint Hill Vi… 1908… 12:06 38.7 0.000578
## 6 ITS2 KLS0052 GMCP1 Gertrude E. … 1908… 13:38 38.9 0
## 7 ITS2 KLS0054 GMCP1 Gertrude E. … 1908… 14:15 38.9 0
## 8 ITS2 KLS0055 GMCP1 Gertrude E. … 1908… 14:16 38.9 0
## 9 ITS2 KLS0071 SP2 Seilheimer P… 1909… 09:56 38.0 0.0271
## 10 ITS2 KLS0096 ICF2 Ivy Creek Fo… 1909… 09:52 38.1 0.0988
## # ℹ 66 more rows
## # ℹ 99 more variables: NLCD1000_Developed <dbl>, NLCD1000_Forest <dbl>,
## # NLCD1000_Herbaceous <dbl>, NLCD1000_Crop <dbl>, Prunus <dbl>,
## # Dicentra <dbl>, Mertensia <dbl>, Quercus <dbl>, Lamium <dbl>,
## # Polemonium <dbl>, Viola <dbl>, Cardamine <dbl>, Potentilla <dbl>,
## # Taraxacum <dbl>, Buglossoides <dbl>, Medicago <dbl>, Cercis <dbl>,
## # Acer <dbl>, Exochorda <dbl>, Mummenhoffia <dbl>, Halesia <dbl>, …
# does shannon diversity of pollen samples differ among sites? across land use contexts?
dat<-as.data.frame(dat)
colnames(dat)
## [1] "Barcode" "BeeID" "SiteID"
## [4] "Name" "JDate" "CapTimeHM"
## [7] "Updated_y" "NLCD1000_Water" "NLCD1000_Developed"
## [10] "NLCD1000_Forest" "NLCD1000_Herbaceous" "NLCD1000_Crop"
## [13] "Prunus" "Dicentra" "Mertensia"
## [16] "Quercus" "Lamium" "Polemonium"
## [19] "Viola" "Cardamine" "Potentilla"
## [22] "Taraxacum" "Buglossoides" "Medicago"
## [25] "Cercis" "Acer" "Exochorda"
## [28] "Mummenhoffia" "Halesia" "Packera"
## [31] "Robinia" "Platanus" "Geum"
## [34] "Podophyllum" "Thlaspi" "Ranunculus"
## [37] "Viburnum" "Syringa" "Crataegus"
## [40] "Erigeron" "Chelidonium" "Leucojum"
## [43] "Photinia" "Paulownia" "Fontanesia"
## [46] "Hesperis" "Aesculus" "Rosa"
## [49] "Celastrus" "Lamprocapnos" "Pulmonaria"
## [52] "Aronia" "Stylophorum" "Pyrus"
## [55] "Malus" "Fagus" "Vicia"
## [58] "Berberis" "Vaccinium" "Impatiens"
## [61] "Gaylussacia" "Anemone" "Spiraea"
## [64] "Lonicera" "Pinus" "Salix"
## [67] "Anthoxanthum" "Brassica" "Elaeagnus"
## [70] "Glechoma" "Liquidambar" "Lupinus"
## [73] "Morus" "Chamaecyparis" "Veronica"
## [76] "Asimina" "Cunninghamia" "Lolium"
## [79] "Poa" "Vinca" "Cynoglossum"
## [82] "Euscaphis" "Akebia" "Magnolia"
## [85] "Ginkgo" "Picea" "Trifolium"
## [88] "Barbarea" "Dactylis" "Fraxinus"
## [91] "Wisteria" "Virgilia" "Liriodendron"
## [94] "Nyssa" "Gleditsia" "Vitis"
## [97] "Securigera" "Gelsemium" "Pachysandra"
## [100] "Scilla" "Toxicodendron" "Ligustrum"
## [103] "Cornus" "Alliaria" "Rubus"
## [106] "Isatis" "Phleum"
dat2<-as.matrix(dat %>% select(Prunus:Phleum) %>% select(-any_of(singletonGenera)))
rownames(dat2)<-dat$BeeID
dat$ShannonD<-vegan::diversity(dat2, index="shannon")
hist(dat$ShannonD)
hist(as.numeric(dat$JDate))
hist(dat$NLCD1000_Developed)
hist(scale(dat$ShannonD))
hist(scale(as.numeric(dat$JDate)))
hist(scale(dat$NLCD1000_Developed))
polldivers.lmm <- nlme::lme(ShannonD~scale(NLCD1000_Developed)+scale(NLCD1000_Forest)+scale(NLCD1000_Herbaceous)+scale(NLCD1000_Crop)+scale(as.numeric(JDate))+scale(CapTimeHM), random=~1|Name, data=dat)
plot(polldivers.lmm)
summary(polldivers.lmm) #conditional t-test testing the marginal significance of each fixed effect coefficient with the other fixed effects in the model
## Linear mixed-effects model fit by REML
## Data: dat
## AIC BIC logLik
## 129.5641 149.671 -55.78204
##
## Random effects:
## Formula: ~1 | Name
## (Intercept) Residual
## StdDev: 0.06317034 0.4533868
##
## Fixed effects: ShannonD ~ scale(NLCD1000_Developed) + scale(NLCD1000_Forest) + scale(NLCD1000_Herbaceous) + scale(NLCD1000_Crop) + scale(as.numeric(JDate)) + scale(CapTimeHM)
## Value Std.Error DF t-value p-value
## (Intercept) 0.7074114 0.0557090 58 12.698321 0.0000
## scale(NLCD1000_Developed) 0.2851386 0.8096789 11 0.352163 0.7314
## scale(NLCD1000_Forest) 0.3825655 0.7648605 11 0.500177 0.6268
## scale(NLCD1000_Herbaceous) 0.1904430 0.5330449 11 0.357274 0.7276
## scale(NLCD1000_Crop) 0.1234263 0.2944109 11 0.419231 0.6831
## scale(as.numeric(JDate)) 0.1525481 0.0567882 58 2.686263 0.0094
## scale(CapTimeHM) 0.0096755 0.0679871 58 0.142314 0.8873
## Correlation:
## (Intr) s(NLCD1000_D s(NLCD1000_F s(NLCD1000_H
## scale(NLCD1000_Developed) 0.025
## scale(NLCD1000_Forest) 0.024 0.996
## scale(NLCD1000_Herbaceous) 0.019 0.992 0.993
## scale(NLCD1000_Crop) 0.028 0.979 0.978 0.967
## scale(as.numeric(JDate)) 0.006 -0.201 -0.201 -0.225
## scale(CapTimeHM) -0.002 0.414 0.442 0.453
## s(NLCD1000_C s(.(JD
## scale(NLCD1000_Developed)
## scale(NLCD1000_Forest)
## scale(NLCD1000_Herbaceous)
## scale(NLCD1000_Crop)
## scale(as.numeric(JDate)) -0.165
## scale(CapTimeHM) 0.421 -0.021
##
## Standardized Within-Group Residuals:
## Min Q1 Med Q3 Max
## -1.7826417 -0.7307585 -0.1646942 0.7885067 1.9223805
##
## Number of Observations: 76
## Number of Groups: 16
stats::anova(polldivers.lmm) #conditional F-test testing the significance of terms in the fixed effects model (sequentially by default)
## numDF denDF F-value p-value
## (Intercept) 1 58 160.90101 <.0001
## scale(NLCD1000_Developed) 1 11 2.78950 0.1231
## scale(NLCD1000_Forest) 1 11 1.06107 0.3251
## scale(NLCD1000_Herbaceous) 1 11 0.38391 0.5481
## scale(NLCD1000_Crop) 1 11 0.76080 0.4017
## scale(as.numeric(JDate)) 1 58 7.23560 0.0093
## scale(CapTimeHM) 1 58 0.02025 0.8873
summary<-summary(polldivers.lmm)
summary$tTable
## Value Std.Error DF t-value p-value
## (Intercept) 0.707411359 0.05570905 58 12.6983207 2.161841e-18
## scale(NLCD1000_Developed) 0.285138582 0.80967888 11 0.3521626 7.313706e-01
## scale(NLCD1000_Forest) 0.382565510 0.76486054 11 0.5001768 6.268057e-01
## scale(NLCD1000_Herbaceous) 0.190442951 0.53304490 11 0.3572738 7.276469e-01
## scale(NLCD1000_Crop) 0.123426281 0.29441093 11 0.4192313 6.831171e-01
## scale(as.numeric(JDate)) 0.152548109 0.05678823 58 2.6862628 9.409566e-03
## scale(CapTimeHM) 0.009675501 0.06798714 58 0.1423137 8.873255e-01
plot(dat$JDate,dat$ShannonD)
dat %>%
ggplot(aes(x=JDate, y=ShannonD))+
geom_point(size=4)+
stat_smooth(method=stats::lm, se=F)+
theme_bw(base_size = 18)+
labs(title="Pollen diversity over time",
y="Shannon diversity index",
x="Julian date")
## Don't know how to automatically pick scale for object of type <difftime>.
## Defaulting to continuous.
## `geom_smooth()` using formula = 'y ~ x'
#this dataset is dat.Queens plus dat.Sites AND it also keeps data from the two barcodes separate
# environmental variables dataset
dat<- left_join(PollenIDgenera.sum %>% pivot_wider(., names_from = "Genus", values_from = "PropReads"),
dat.Queen %>% select(Date:NestSeek), by="BeeID") %>%
left_join(., dat.Sites, by=c("Date", "SiteID")) %>%
mutate(across(Prunus:Phleum, ~replace(., is.na(.), 0))) %>%
mutate(CapTimeHM=hms::hms(hours=CapTimeH, minutes=CapTimeM, seconds=0), JDate=julian(Date)) %>% #create variables
select(Barcode, BeeID, SiteID, Name,
JDate, CapTimeHM,
Updated_y, NLCD1000_Water:NLCD1000_Crop,
Prunus:Phleum,
-"NA") %>%
# filter(BeeID %in% TwoBarcodeBees$BeeID) %>% #optional
filter(Name!="Blandy Experimental Farm") %>% #remove Blandy bees (REQUIRED)
ungroup()
# explanatory variables must be centered, standardized (if explanatory variables are in different units), transformed (to limit the skew of explanatory variables) or normalized (to linearize relationships) following the same principles as in PCA
# global model2 should include:
#NLCD1000_Forest + NLCD1000_Herbaceous + NLCD1000_Crop + NLCD1000_Developed + Barcode + Updated_y + JDate + CapTimeHM
#viewing distribution of explanatory, environmental variables
hist(dat$NLCD1000_Herbaceous)
hist(dat$Updated_y)
hist(as.numeric(dat$JDate))
hist(scale(dat$NLCD1000_Herbaceous))
hist(scale(dat$Updated_y))
hist(scale(as.numeric(dat$JDate)))
# save center and scale of original Latitude (for plotting later, optional)
# center<-attr(scale(dat$Updated_y), "scaled:center")
# scale<-attr(scale(dat$Updated_y), "scaled:scale")
#transformations & standardization of environmental variables
dat<-dat %>% mutate(across(NLCD1000_Water:NLCD1000_Crop, ~scale(.)),
) %>% as.data.frame()
dat$JDate<-scale(as.numeric(dat$JDate))
dat$Updated_y<-scale(as.numeric(dat$Updated_y))
# pollen community dataset
dat2<-as.data.frame(dat %>%
ungroup() %>%
select(Prunus:Phleum) %>%
select(-any_of(singletonGenera))
)
dat2<-as.matrix(dat2)
rownames(dat2)<-dat$BeeID
#standardization of pollen community data
dat2<-vegan::decostand(dat2,method = "hellinger") # community data standardization
library(vegan)
global.RDA.step<-ordiR2step(rda(dat2~1, data=dat), scope = rda(dat2 ~ NLCD1000_Forest + NLCD1000_Herbaceous + NLCD1000_Crop + NLCD1000_Developed + CapTimeHM + JDate + Updated_y + Barcode, data=dat))
## Step: R2.adj= 0
## Call: dat2 ~ 1
##
## R2.adjusted
## <All variables> 0.269073139
## + JDate 0.084544164
## + NLCD1000_Crop 0.053466365
## + Barcode 0.046724941
## + NLCD1000_Developed 0.041377666
## + Updated_y 0.032432806
## + NLCD1000_Forest 0.021171414
## + CapTimeHM 0.009438771
## + NLCD1000_Herbaceous 0.007206977
## <none> 0.000000000
##
## Df AIC F Pr(>F)
## + JDate 1 -18.232 7.9264 0.002 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: R2.adj= 0.08454416
## Call: dat2 ~ JDate
##
## R2.adjusted
## <All variables> 0.26907314
## + NLCD1000_Crop 0.14437415
## + Updated_y 0.13793315
## + NLCD1000_Developed 0.12312361
## + Barcode 0.11881977
## + NLCD1000_Forest 0.10685527
## + CapTimeHM 0.09442550
## + NLCD1000_Herbaceous 0.09065659
## <none> 0.08454416
##
## Df AIC F Pr(>F)
## + NLCD1000_Crop 1 -22.403 6.1745 0.002 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: R2.adj= 0.1443741
## Call: dat2 ~ JDate + NLCD1000_Crop
##
## R2.adjusted
## <All variables> 0.2690731
## + Updated_y 0.1955759
## + NLCD1000_Developed 0.1792995
## + Barcode 0.1791862
## + NLCD1000_Forest 0.1718648
## + CapTimeHM 0.1549744
## + NLCD1000_Herbaceous 0.1543438
## <none> 0.1443741
##
## Df AIC F Pr(>F)
## + Updated_y 1 -26.141 5.6465 0.002 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: R2.adj= 0.1955759
## Call: dat2 ~ JDate + NLCD1000_Crop + Updated_y
##
## R2.adjusted
## <All variables> 0.2690731
## + Barcode 0.2313515
## + NLCD1000_Herbaceous 0.2059329
## + NLCD1000_Developed 0.2057908
## + NLCD1000_Forest 0.1998819
## + CapTimeHM 0.1989565
## <none> 0.1955759
##
## Df AIC F Pr(>F)
## + Barcode 1 -28.662 4.3511 0.002 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: R2.adj= 0.2313515
## Call: dat2 ~ JDate + NLCD1000_Crop + Updated_y + Barcode
##
## R2.adjusted
## <All variables> 0.2690731
## + NLCD1000_Herbaceous 0.2412230
## + NLCD1000_Developed 0.2406660
## + NLCD1000_Forest 0.2362107
## + CapTimeHM 0.2352266
## <none> 0.2313515
##
## Df AIC F Pr(>F)
## + NLCD1000_Herbaceous 1 -28.722 1.9237 0.028 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: R2.adj= 0.241223
## Call: dat2 ~ JDate + NLCD1000_Crop + Updated_y + Barcode + NLCD1000_Herbaceous
##
## R2.adjusted
## <All variables> 0.2690731
## + CapTimeHM 0.2475644
## + NLCD1000_Forest 0.2474796
## + NLCD1000_Developed 0.2474379
## <none> 0.2412230
##
## Df AIC F Pr(>F)
## + CapTimeHM 1 -28.453 1.59 0.046 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Step: R2.adj= 0.2475644
## Call: dat2 ~ JDate + NLCD1000_Crop + Updated_y + Barcode + NLCD1000_Herbaceous + CapTimeHM
##
## R2.adjusted
## <All variables> 0.2690731
## + NLCD1000_Forest 0.2554377
## + NLCD1000_Developed 0.2552923
## <none> 0.2475644
##
## Df AIC F Pr(>F)
## + NLCD1000_Forest 1 -28.362 1.7296 0.062 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#sample size calc
records<-dim(dat)[1] # rows
bees<-n_distinct(dat$BeeID) # bees
sites<-n_distinct(dat$Name) # sites
#best model of full dataset
RDA_LandCover<-vegan::rda(dat2 ~ JDate + NLCD1000_Crop + Updated_y + Barcode + NLCD1000_Herbaceous, data=dat) #n=76 datapoints from 52 bees from 16 sites
#best model of subset (i.e., remove samples without both barcodes) n=48 datapoints from 24 bees from 12 sites
# RDA_LandCover<-vegan::rda(dat2 ~ JDate + NLCD1000_Crop + Updated_y + Barcode + NLCD1000_Developed + NLCD1000_Forest + NLCD1000_Herbaceous , data=dat)
RsquareAdj(RDA_LandCover)
## $r.squared
## [1] 0.2918081
##
## $adj.r.squared
## [1] 0.241223
anova.cca(RDA_LandCover, by="term")
## Permutation test for rda under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## Model: rda(formula = dat2 ~ JDate + NLCD1000_Crop + Updated_y + Barcode + NLCD1000_Herbaceous, data = dat)
## Df Variance F Pr(>F)
## JDate 1 0.08101 9.5631 0.001 ***
## NLCD1000_Crop 1 0.05898 6.9626 0.001 ***
## Updated_y 1 0.05071 5.9861 0.001 ***
## Barcode 1 0.03734 4.4077 0.001 ***
## NLCD1000_Herbaceous 1 0.01630 1.9237 0.043 *
## Residual 70 0.59299
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(RDA_LandCover)[c(7:9,11)]
## $tot.chi
## [1] 0.8373357
##
## $constr.chi
## [1] 0.2443414
##
## $unconst.chi
## [1] 0.5929944
##
## $concont
## $concont$importance
## Importance of components:
## RDA1 RDA2 RDA3 RDA4 RDA5
## Eigenvalue 0.1063 0.05512 0.04741 0.02338 0.01212
## Proportion Explained 0.4351 0.22557 0.19405 0.09567 0.04959
## Cumulative Proportion 0.4351 0.66069 0.85474 0.95041 1.00000
#anova.cca(RDA_LandCover, by="axis")
#vegan::ordiplot(RDA_LandCover, scaling=1, type="text")
# Scaling 1 shows similarities between objects in the response matrix.
# Sites (numbers) that are closer together have more similar communities.
# Species that are closer together occupy more sites in common.
RDA_plot_df<-vegan::ordiplot(RDA_LandCover, scaling=2, type = "text")
# Scaling 2 shows the effects of explanatory variables.
# Longer arrows mean this variable strongly drives the variation in the community matrix.
# Arrows pointing in opposite directions have a negative relationship.
# Arrows pointing in the same direction have a positive relationship.
#an RDA with x,x,x can explain about xx.x% of the total variation in pollen community composition. anova.cca() generate a significant p-value for this difference
#grabbing data from rda model (or the ordiplot thereof) for plotting in ggplot
#grabbing data from rda model (or the ordiplot thereof) for plotting in ggplot
sites.long <- BiodiversityR::sites.long(RDA_plot_df, env.data=dat)
species.long <- BiodiversityR::species.long(RDA_plot_df)
species.long2<- species.long %>%
mutate(diff=sqrt(abs(axis1)^2+abs(axis2)^2)) %>%
filter(diff>0.15) # select pollen species greater than X.XX units from origin (0,0)
axis.long <- BiodiversityR::axis.long(RDA_LandCover, choices=c(1, 2))
arrows <- as.data.frame(vegan::scores(RDA_LandCover)$biplot) #
arrows$names<-rownames(arrows)
arrows <- arrows %>% # select non-County arrows greater than 0.2 units from 0,0
mutate(diff=sqrt(abs(RDA1)^2+abs(RDA2)^2)) %>%
filter(!str_starts(names,"Barcode"), #represented as color and shape
diff>0.15)
colnames(arrows)<-c("axis1","axis2","labels","diff")
rbind(species.long2, arrows)
## axis1 axis2 labels diff
## Prunus -0.773127745 0.27097049 Prunus 0.8192384
## Lamium 0.338261530 0.52213164 Lamium 0.6221272
## Cercis -0.008520592 -0.27520275 Cercis 0.2753346
## Acer -0.258346269 -0.15696354 Acer 0.3022918
## Vaccinium 0.048628194 -0.16651499 Vaccinium 0.1734703
## Pinus 0.306146504 0.01898303 Pinus 0.3067345
## Salix 0.222940702 -0.07908980 Salix 0.2365539
## JDate 0.812812428 -0.19866066 JDate 0.8367378
## NLCD1000_Crop 0.283497851 0.79727300 NLCD1000_Crop 0.8461769
## Updated_y -0.095481519 0.16985197 Updated_y 0.1948497
## NLCD1000_Herbaceous 0.162970704 -0.06617047 NLCD1000_Herbaceous 0.1758920
#sites.long$Updated_y_descaled<-as.numeric(unlist(ggfortify::unscale(dat$Updated_y, center=center, scale=scale)))
plot2<-ggplot() +
geom_vline(xintercept = c(0), color = "grey70", linetype = 2) +
geom_hline(yintercept = c(0), color = "grey70", linetype = 2) +
labs(x=axis.long[1, "label"], y=axis.long[2, "label"]) +
scale_x_continuous(sec.axis = dup_axis(labels=NULL, name=NULL)) +
scale_y_continuous(sec.axis = dup_axis(labels=NULL, name=NULL)) +
geom_point(data=sites.long, aes(x=axis1, y=axis2, shape=Barcode, color=Barcode), size=5, alpha=0.7) +
geom_point(data=species.long, aes(x=axis1, y=axis2), size=3, alpha=0.7) +
ggrepel::geom_text_repel(data=rbind(species.long2, arrows), aes(x = axis1, y = axis2, label = labels), size=5, box.padding = 0.75, direction = "both") +
geom_segment(data=arrows, aes(x=0, xend=axis1, y=0, yend=axis2), arrow=arrow(length=unit(0.25, "cm")), colour="black") +
coord_fixed(ratio=1) +
theme_bw(base_size = 18)+
labs(title=paste("Land cover differences (n=", records, " records from ", bees, " bees, ", sites, " sites)", sep=""))
plot2
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
ggpubr::ggarrange(plot1, plot2, ncol = 2, nrow = 1)
## Warning: ggrepel: 21 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 10 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
# total queens observed
dat<-dat.Sites%>%filter(Name!="Blandy Experimental Farm")
sum(dat.Sites$TotalCaptured + dat.Sites$TotalCaptured) #total
## [1] 1166
sum(dat$TotalCaptured + dat$TotalCaptured) #excluding blandy
## [1] 690
# bombus queen richness
unique(dat.Queen$Bombus_spp)
## [1] fervidus bimaculatus
## [3] griseocollis vagans-sandersoni-perplexus
## [5] impatiens pensylvanicus
## [7] auricomus
## 7 Levels: auricomus bimaculatus fervidus griseocollis ... vagans-sandersoni-perplexus
# by species, a count of bees, date of first detect, first pollen, first nest
left_join(
dat.Queen %>%
group_by(Bombus_spp) %>%
summarise(
Total=n(),
FirstDetect=min(Date)
),
dat.Queen %>%
group_by(Bombus_spp) %>%
filter(CarryPollen==1) %>%
summarise(
CarryPollen=n(),
FirstPollen=min(Date)
),
by="Bombus_spp") %>%
left_join(.,
dat.Queen %>%
group_by(Bombus_spp) %>%
filter(NestSeek==1) %>%
summarise(
NestSeek=n(),
FirstNestSeek=min(Date))
,
by="Bombus_spp") %>%
left_join(.,
dat.Queen %>%
group_by(Bombus_spp) %>%
filter(Forage==1) %>%
summarise(
Forage=n(),
FirstForage=min(Date))
,
by="Bombus_spp")
## # A tibble: 7 × 9
## Bombus_spp Total FirstDetect CarryPollen FirstPollen NestSeek
## <fct> <int> <dttm> <int> <dttm> <int>
## 1 auricomus 3 2022-04-28 00:00:00 3 2022-04-28 00:00:00 NA
## 2 bimaculatus 56 2022-03-26 00:00:00 56 2022-03-26 00:00:00 NA
## 3 fervidus 8 2022-04-25 00:00:00 8 2022-04-25 00:00:00 NA
## 4 griseocoll… 26 2022-04-16 00:00:00 26 2022-04-16 00:00:00 NA
## 5 impatiens 11 2022-03-22 00:00:00 11 2022-03-22 00:00:00 1
## 6 pensylvani… 1 2022-04-24 00:00:00 1 2022-04-24 00:00:00 NA
## 7 vagans-san… 2 2022-04-13 00:00:00 2 2022-04-13 00:00:00 NA
## # ℹ 3 more variables: FirstNestSeek <dttm>, Forage <int>, FirstForage <dttm>
dat.Queen %>% group_by(CarryPollen,Bombus_spp) %>% #number of queens with pollen by species
summarise(Count=n())
## `summarise()` has grouped output by 'CarryPollen'. You can override using the
## `.groups` argument.
## # A tibble: 7 × 3
## # Groups: CarryPollen [1]
## CarryPollen Bombus_spp Count
## <int> <fct> <int>
## 1 1 auricomus 3
## 2 1 bimaculatus 56
## 3 1 fervidus 8
## 4 1 griseocollis 26
## 5 1 impatiens 11
## 6 1 pensylvanicus 1
## 7 1 vagans-sandersoni-perplexus 2
#filter out blandy
dat.Queen %>% filter(!str_starts(SiteID, "BEF")) %>%
group_by(Bombus_spp) %>% #number of queens with pollen by species
summarise(Count=n())
## # A tibble: 6 × 2
## Bombus_spp Count
## <fct> <int>
## 1 bimaculatus 30
## 2 fervidus 5
## 3 griseocollis 7
## 4 impatiens 7
## 5 pensylvanicus 1
## 6 vagans-sandersoni-perplexus 2
dat.Queen %>% filter(!str_starts(SiteID, "BEF")) %>%
group_by(CarryPollen,Bombus_spp)%>% #number of queens with pollen by species
summarise(Count=n())
## `summarise()` has grouped output by 'CarryPollen'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 3
## # Groups: CarryPollen [1]
## CarryPollen Bombus_spp Count
## <int> <fct> <int>
## 1 1 bimaculatus 30
## 2 1 fervidus 5
## 3 1 griseocollis 7
## 4 1 impatiens 7
## 5 1 pensylvanicus 1
## 6 1 vagans-sandersoni-perplexus 2
dat.Queen %>% filter(!str_starts(SiteID, "BEF")) %>%
group_by(NestSeek, Bombus_spp)%>% #number of queens nest seeking
summarise(Count=n())
## `summarise()` has grouped output by 'NestSeek'. You can override using the
## `.groups` argument.
## # A tibble: 7 × 3
## # Groups: NestSeek [2]
## NestSeek Bombus_spp Count
## <dbl> <fct> <int>
## 1 0 bimaculatus 30
## 2 0 fervidus 5
## 3 0 griseocollis 7
## 4 0 impatiens 6
## 5 0 pensylvanicus 1
## 6 0 vagans-sandersoni-perplexus 2
## 7 1 impatiens 1
dat.Sites %>% select(SiteID, bimaculatus:pensylvanicus) %>%
pivot_longer(cols=c(bimaculatus:pensylvanicus), names_to = "Bombus_spp", values_to = "count") %>%
filter(count>0) %>%
group_by(SiteID) %>%
summarize(BeeRichness=n()) %>%
left_join(., dat.Sites, by="SiteID") %>%
select(Name, SiteID, County, Type, Updated_x, Updated_y, Date, FloralRichness, Bees_60min, BeeRichness)
## # A tibble: 70 × 10
## Name SiteID County Type Updated_x Updated_y Date
## <chr> <fct> <chr> <chr> <dbl> <dbl> <dttm>
## 1 Blandy Experimen… BEF_01 Clarke Loca… -78.1 39.1 2022-03-30 00:00:00
## 2 Blandy Experimen… BEF_02 Clarke Loca… -78.1 39.1 2022-03-31 00:00:00
## 3 Blandy Experimen… BEF_03 Clarke Loca… -78.1 39.1 2022-04-01 00:00:00
## 4 Blandy Experimen… BEF_04 Clarke Loca… -78.1 39.1 2022-04-04 00:00:00
## 5 Blandy Experimen… BEF_06 Clarke Loca… -78.1 39.1 2022-04-08 00:00:00
## 6 Blandy Experimen… BEF_09 Clarke Loca… -78.1 39.1 2022-04-11 00:00:00
## 7 Blandy Experimen… BEF_11 Clarke Loca… -78.1 39.1 2022-04-13 00:00:00
## 8 Blandy Experimen… BEF_12 Clarke Loca… -78.1 39.1 2022-04-14 00:00:00
## 9 Blandy Experimen… BEF_14 Clarke Loca… -78.1 39.1 2022-04-15 00:00:00
## 10 Blandy Experimen… BEF_15 Clarke Loca… -78.1 39.1 2022-04-20 00:00:00
## # ℹ 60 more rows
## # ℹ 3 more variables: FloralRichness <dbl>, Bees_60min <dbl>, BeeRichness <int>
dat<-dat.Sites %>% select(Name, Updated_x, Updated_y) %>% distinct()
dist<-raster::pointDistance(dat[, c("Updated_x", "Updated_y")], lonlat=TRUE)
dist[dist==0]<-NA
mean(dist, na.rm = T)
## [1] 55327.82
min(dist, na.rm = T)
## [1] 1751.81
max(dist, na.rm = T)
## [1] 141190.9
rm(dist)